| @@ -1369,6 +1369,7 @@ type ( | |||
| Code int `json:"code,omitempty"` | |||
| Msg string `json:"msg,omitempty"` | |||
| Data interface{} `json:"data,omitempty"` | |||
| TraceId string `json:"traceId,omitempty"` | |||
| } | |||
| ) | |||
| @@ -111,8 +111,9 @@ type ( | |||
| ) | |||
| type cancelJobReq { | |||
| ClusterId int64 `form:"clusterId"` | |||
| JobId string `form:"jobId"` | |||
| ClusterId int64 `form:"clusterId,optional"` | |||
| JobId string `form:"jobId,optional"` | |||
| taskId string `form:"taskId"` | |||
| } | |||
| type jobInfoReq { | |||
| @@ -174,4 +175,18 @@ type ( | |||
| } | |||
| /******************instance center*************************/ | |||
| ) | |||
| type( | |||
| HpcTaskLogResp{ | |||
| Code int32 `json:"code"` | |||
| Msg string `json:"msg"` | |||
| Data interface{} `json:"data"` | |||
| } | |||
| ) | |||
| type( | |||
| HpcTaskLogReq{ | |||
| TaskId string `path:"taskId"` | |||
| } | |||
| ) | |||
| @@ -13,18 +13,18 @@ import ( | |||
| "inference/inference.api" | |||
| ) | |||
| info( | |||
| title: "pcm api service" | |||
| desc: "type desc here" | |||
| author: "type author here" | |||
| email: "type email here" | |||
| info ( | |||
| title: "pcm api service" | |||
| desc: "type desc here" | |||
| author: "type author here" | |||
| email: "type email here" | |||
| version: "type version here" | |||
| ) | |||
| //core端接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: core | |||
| group: core | |||
| ) | |||
| service pcm { | |||
| @doc "查询P端服务列表" | |||
| @@ -177,9 +177,9 @@ service pcm { | |||
| } | |||
| //hpc二级接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: hpc | |||
| group: hpc | |||
| ) | |||
| service pcm { | |||
| @doc "提交超算任务" | |||
| @@ -216,13 +216,17 @@ service pcm { | |||
| @doc "查询超算应用中心列表" | |||
| @handler ListInstanceCenter | |||
| get /hpc/ListInstanceCenter(HpcInstanceCenterReq) returns (PageResult) | |||
| get /hpc/ListInstanceCenter (HpcInstanceCenterReq) returns (PageResult) | |||
| @doc "超算任务日志" | |||
| @handler getHpcTaskLogHandler | |||
| get /hpc/jobLogs/:taskId (HpcTaskLogReq) returns (HpcTaskLogResp) | |||
| } | |||
| //cloud二级接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: cloud | |||
| group: cloud | |||
| ) | |||
| service pcm { | |||
| @doc "云算任务列表" | |||
| @@ -258,9 +262,9 @@ service pcm { | |||
| } | |||
| //智算二级接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: ai | |||
| group: ai | |||
| ) | |||
| service pcm { | |||
| @doc "训练任务统计" | |||
| @@ -407,13 +411,13 @@ service pcm { | |||
| @doc "文本识别" | |||
| @handler ChatHandler | |||
| post /ai/chat (ChatReq) returns (ChatResult) | |||
| /******chat end***********/ | |||
| /******chat end***********/ | |||
| } | |||
| //screen接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: storage | |||
| group: storage | |||
| ) | |||
| service pcm { | |||
| @doc "日常算力查询" | |||
| @@ -426,9 +430,9 @@ service pcm { | |||
| } | |||
| //openstack 接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: vm | |||
| group: vm | |||
| ) | |||
| service pcm { | |||
| @doc "openstack计算中心概览" | |||
| @@ -825,9 +829,9 @@ service pcm { | |||
| } | |||
| //存算联动 接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: storelink | |||
| group: storelink | |||
| ) | |||
| service pcm { | |||
| @handler UploadLinkImageHandler | |||
| @@ -856,9 +860,9 @@ service pcm { | |||
| } | |||
| // 接口 | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: adapters | |||
| group: adapters | |||
| ) | |||
| service pcm { | |||
| @handler AdaptersListHandler | |||
| @@ -901,9 +905,9 @@ service pcm { | |||
| get /adapter/getAdapterInfo (adapterInfoNameReq) returns (adapterInfoNameReqResp) | |||
| } | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: schedule | |||
| group: schedule | |||
| ) | |||
| service pcm { | |||
| @handler ScheduleGetAiResourceTypesHandler | |||
| @@ -955,9 +959,9 @@ service pcm { | |||
| post /schedule/cancelTask (CancelTaskReq) returns (CancelTaskResp) | |||
| } | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: inference | |||
| group: inference | |||
| ) | |||
| service pcm { | |||
| @handler GetDeployInstanceHandler | |||
| @@ -1018,9 +1022,9 @@ service pcm { | |||
| get /inference/getAdaptersByModel (GetAdaptersByModelReq) returns (GetAdaptersByModelResp) | |||
| } | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: dictionary | |||
| group: dictionary | |||
| ) | |||
| service pcm { | |||
| @handler GetDict | |||
| @@ -1057,9 +1061,9 @@ service pcm { | |||
| get /dictItem/code/:dictCode (DictCodeReq) returns (PageResult) | |||
| } | |||
| @server( | |||
| @server ( | |||
| prefix: pcm/v1 | |||
| group: monitoring | |||
| group: monitoring | |||
| ) | |||
| service pcm { | |||
| @handler CreateAlertRuleHandler | |||
| @@ -1096,4 +1100,5 @@ service pcm { | |||
| @handler scheduleSituationHandler | |||
| get /monitoring/schedule/situation returns (scheduleSituationResp) | |||
| } | |||
| } | |||
| @@ -0,0 +1,24 @@ | |||
| package hpc | |||
| import ( | |||
| "github.com/zeromicro/go-zero/rest/httpx" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/hpc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" | |||
| "net/http" | |||
| ) | |||
| func GetHpcTaskLogHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { | |||
| return func(w http.ResponseWriter, r *http.Request) { | |||
| var req types.HpcTaskLogReq | |||
| if err := httpx.Parse(r, &req); err != nil { | |||
| result.ParamErrorResult(r, w, err) | |||
| return | |||
| } | |||
| l := hpc.NewGetHpcTaskLogLogic(r.Context(), svcCtx) | |||
| resp, err := l.GetHpcTaskLog(&req) | |||
| result.HttpResult(r, w, resp, err) | |||
| } | |||
| } | |||
| @@ -2,54 +2,101 @@ package hpc | |||
| import ( | |||
| "context" | |||
| "github.com/go-resty/resty/v2" | |||
| "github.com/pkg/errors" | |||
| "fmt" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | |||
| "gitlink.org.cn/JointCloud/pcm-hpc/slurm" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | |||
| ) | |||
| type CancelJobLogic struct { | |||
| logx.Logger | |||
| ctx context.Context | |||
| svcCtx *svc.ServiceContext | |||
| ctx context.Context | |||
| svcCtx *svc.ServiceContext | |||
| hpcService *service.HpcService | |||
| } | |||
| type TaskHPCResult struct { | |||
| ID uint `gorm:"column:id"` // 对应 t.id | |||
| JobID string `gorm:"column:job_id"` // 对应 hpc.job_id | |||
| AdapterId string `gorm:"column:adapter_id"` // 对应 hpc.adapter_id | |||
| } | |||
| func NewCancelJobLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CancelJobLogic { | |||
| cache := make(map[string]interface{}, 10) | |||
| hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache) | |||
| if err != nil { | |||
| return nil | |||
| } | |||
| return &CancelJobLogic{ | |||
| Logger: logx.WithContext(ctx), | |||
| ctx: ctx, | |||
| svcCtx: svcCtx, | |||
| Logger: logx.WithContext(ctx), | |||
| ctx: ctx, | |||
| svcCtx: svcCtx, | |||
| hpcService: hpcService, | |||
| } | |||
| } | |||
| func (l *CancelJobLogic) CancelJob(req *types.CancelJobReq) error { | |||
| var clusterInfo *types.ClusterInfo | |||
| tx := l.svcCtx.DbEngin.Raw("select * from t_cluster where id = ?", req.ClusterId).Scan(&clusterInfo) | |||
| if tx.Error != nil { | |||
| return tx.Error | |||
| } | |||
| // 查询p端调用地址 | |||
| var adapterAddress string | |||
| l.svcCtx.DbEngin.Raw("SELECT server FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterAddress) | |||
| var jobResp slurm.GetJobResp | |||
| httpClient := resty.New().R() | |||
| _, err := httpClient.SetHeader("Content-Type", "application/json"). | |||
| SetQueryParams(map[string]string{ | |||
| "jobId": req.JobId, | |||
| "server": clusterInfo.Server, | |||
| "version": clusterInfo.Version, | |||
| "token": clusterInfo.Token, | |||
| "username": clusterInfo.Username, | |||
| }). | |||
| SetResult(&jobResp). | |||
| Delete(adapterAddress + "/api/v1/job/cancel") | |||
| //var clusterInfo *types.ClusterInfo | |||
| //tx := l.svcCtx.DbEngin.Raw("select * from t_cluster where id = ?", req.ClusterId).Scan(&clusterInfo) | |||
| //if tx.Error != nil { | |||
| // return tx.Error | |||
| //} | |||
| //// 查询p端调用地址 | |||
| //var adapterAddress string | |||
| //l.svcCtx.DbEngin.Raw("SELECT server FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterAddress) | |||
| //var jobResp slurm.GetJobResp | |||
| //httpClient := resty.New().R() | |||
| //_, err := httpClient.SetHeader("Content-Type", "application/json"). | |||
| // SetQueryParams(map[string]string{ | |||
| // "jobId": req.JobId, | |||
| // "server": clusterInfo.Server, | |||
| // "version": clusterInfo.Version, | |||
| // "token": clusterInfo.Token, | |||
| // "username": clusterInfo.Username, | |||
| // }). | |||
| // SetResult(&jobResp). | |||
| // Delete(adapterAddress + "/api/v1/job/cancel") | |||
| //if err != nil { | |||
| // return err | |||
| //} | |||
| //if len(jobResp.Errors) != 0 { | |||
| // return errors.Errorf(jobResp.Errors[0].Description) | |||
| //} | |||
| //return nil | |||
| var hpcR TaskHPCResult | |||
| tx := l.svcCtx.DbEngin.Raw( | |||
| "SELECT t.id, hpc.job_id ,hpc.adapter_id FROM task t "+ | |||
| "INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+ | |||
| "WHERE adapter_type_dict = 2 AND t.id = ?", | |||
| req.TaskId, | |||
| ).Scan(&hpcR).Error | |||
| if tx != nil { | |||
| return fmt.Errorf("数据库查询失败: %v", tx.Error) | |||
| } | |||
| if hpcR.ID == 0 || hpcR.JobID == "" { | |||
| return fmt.Errorf("作业不存在") | |||
| } | |||
| var adapterInfo types.AdapterInfo | |||
| l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", hpcR.AdapterId).Scan(&adapterInfo) | |||
| if adapterInfo.Id == "" { | |||
| return fmt.Errorf("adapter not found") | |||
| } | |||
| // 取消作业 | |||
| err := l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].CancelTask(l.ctx, hpcR.JobID) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| if len(jobResp.Errors) != 0 { | |||
| return errors.Errorf(jobResp.Errors[0].Description) | |||
| // 更新数据库状态 | |||
| tx = l.svcCtx.DbEngin.Model(&types.Task{}).Where("id = ?", hpcR.ID).Update("status", "Canceled").Error | |||
| if tx != nil { | |||
| return fmt.Errorf("数据库更新失败: %v", tx.Error) | |||
| } | |||
| // 更新数据库状态 | |||
| tx = l.svcCtx.DbEngin.Model(&models.TaskHpc{}).Where("task_id = ?", hpcR.ID).Update("status", "Canceled").Error | |||
| if tx != nil { | |||
| return fmt.Errorf("数据库更新失败: %v", tx.Error) | |||
| } | |||
| return nil | |||
| } | |||
| @@ -155,6 +155,7 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t | |||
| return nil, updates.Error | |||
| } | |||
| resp.Data.JobInfo["taskId"] = strconv.FormatInt(taskModel.Id, 10) | |||
| logx.Infof("提交job到指定集群成功, resp: %v", resp) | |||
| return resp, nil | |||
| } | |||
| @@ -0,0 +1,59 @@ | |||
| package hpc | |||
| import ( | |||
| "context" | |||
| "fmt" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| ) | |||
| type GetHpcTaskLogLogic struct { | |||
| logx.Logger | |||
| ctx context.Context | |||
| svcCtx *svc.ServiceContext | |||
| hpcService *service.HpcService | |||
| } | |||
| // 超算任务日志 | |||
| func NewGetHpcTaskLogLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetHpcTaskLogLogic { | |||
| cache := make(map[string]interface{}, 10) | |||
| hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache) | |||
| if err != nil { | |||
| return nil | |||
| } | |||
| return &GetHpcTaskLogLogic{ | |||
| Logger: logx.WithContext(ctx), | |||
| ctx: ctx, | |||
| svcCtx: svcCtx, | |||
| hpcService: hpcService, | |||
| } | |||
| } | |||
| func (l *GetHpcTaskLogLogic) GetHpcTaskLog(req *types.HpcTaskLogReq) (resp interface{}, err error) { | |||
| var hpcR TaskHPCResult | |||
| tx := l.svcCtx.DbEngin.Raw( | |||
| "SELECT t.id, hpc.job_id ,hpc.adapter_id FROM task t "+ | |||
| "INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+ | |||
| "WHERE adapter_type_dict = 2 AND t.id = ?", | |||
| req.TaskId, | |||
| ).Scan(&hpcR).Error | |||
| if tx != nil { | |||
| return nil, fmt.Errorf("数据库查询失败: %v", tx.Error) | |||
| } | |||
| var adapterInfo types.AdapterInfo | |||
| l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", hpcR.AdapterId).Scan(&adapterInfo) | |||
| if adapterInfo.Id == "" { | |||
| return nil, fmt.Errorf("adapter not found") | |||
| } | |||
| // 取消作业 | |||
| resp, err = l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].GetTaskLogs(l.ctx, hpcR.JobID) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return resp, nil | |||
| } | |||
| @@ -9,6 +9,8 @@ import ( | |||
| type HPCCollector interface { | |||
| GetTask(ctx context.Context, taskId string) (*Task, error) | |||
| SubmitTask(ctx context.Context, req types.CommitHpcTaskReq) (*types.CommitHpcTaskResp, error) | |||
| CancelTask(ctx context.Context, jobId string) error | |||
| GetTaskLogs(ctx context.Context, jobId string) (interface{}, error) | |||
| } | |||
| type JobInfo struct { | |||
| @@ -19,8 +19,11 @@ type ParticipantHpc struct { | |||
| } | |||
| const ( | |||
| BackendSlurm = "slurm" | |||
| JobDetailUrl = "/api/v1/jobs/detail/{backend}/{jobId}" | |||
| SubmitTaskUrl = "/api/v1/jobs" | |||
| CancelTaskUrl = "/api/v1/jobs/cancel/{backend}/{jobId}" | |||
| JobLogUrl = "/api/v1/jobs/logs/{backend}/{jobId}" | |||
| ) | |||
| func NewHpc(host string, id int64, platform string) *ParticipantHpc { | |||
| @@ -91,3 +94,41 @@ func (c *ParticipantHpc) SubmitTask(ctx context.Context, req types.CommitHpcTask | |||
| return &resp, nil | |||
| } | |||
| func (c *ParticipantHpc) CancelTask(ctx context.Context, jobId string) error { | |||
| reqUrl := c.host + CancelTaskUrl | |||
| resp := types.CommonResp{} | |||
| logx.WithContext(ctx).Infof("取消超算集群任务, url: %s, jobId: %s", reqUrl, jobId) | |||
| httpClient := resty.New().R() | |||
| _, err := httpClient.SetHeaders( | |||
| map[string]string{ | |||
| "Content-Type": "application/json", | |||
| "traceId": result.TraceIDFromContext(ctx), | |||
| }).SetPathParams(map[string]string{ | |||
| "backend": BackendSlurm, | |||
| "jobId": jobId, | |||
| }).SetResult(&resp).Delete(reqUrl) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
| func (c *ParticipantHpc) GetTaskLogs(ctx context.Context, jobId string) (interface{}, error) { | |||
| reqUrl := c.host + JobLogUrl | |||
| resp := types.CommonResp{} | |||
| logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", reqUrl, jobId) | |||
| httpClient := resty.New().R() | |||
| _, err := httpClient.SetHeaders( | |||
| map[string]string{ | |||
| "Content-Type": "application/json", | |||
| "traceId": result.TraceIDFromContext(ctx), | |||
| }).SetPathParams(map[string]string{ | |||
| "backend": BackendSlurm, | |||
| "jobId": jobId, | |||
| }).SetResult(&resp).Get(reqUrl) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return resp, nil | |||
| } | |||