zhangwei 2 months ago
parent
commit
e5b41573a5
9 changed files with 179 additions and 14 deletions
  1. +6
    -0
      internal/handler/routes.go
  2. +14
    -0
      internal/handler/xjlab/task.go
  3. +27
    -2
      internal/logic/schedule/schedulegetaijoblogloglogic.go
  4. +82
    -0
      internal/logic/xjlab/task_analyze.go
  5. +11
    -0
      internal/scheduler/database/aiStorage.go
  6. +1
    -1
      internal/scheduler/service/collector/hpc_collector.go
  7. +26
    -0
      internal/scheduler/service/hpc/slurm.go
  8. +1
    -1
      internal/storeLink/openi.go
  9. +11
    -10
      pkg/constants/task.go

+ 6
- 0
internal/handler/routes.go View File

@@ -1757,6 +1757,12 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/xjlab/taskStatusStatistics", Path: "/xjlab/taskStatusStatistics",
Handler: xjlab.TaskStatusStatisticsHandler(serverCtx), Handler: xjlab.TaskStatusStatisticsHandler(serverCtx),
}, },
{
//单任务失败分析
Method: http.MethodGet,
Path: "/xjlab/task/failureAnalyze",
Handler: xjlab.TaskFailureAnalyzeHandler(serverCtx),
},
}, },
rest.WithPrefix("/pcm/v1"), rest.WithPrefix("/pcm/v1"),
) )


+ 14
- 0
internal/handler/xjlab/task.go View File

@@ -62,3 +62,17 @@ func TaskStatusStatisticsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)
} }
} }

func TaskFailureAnalyzeHandler(ctx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.FId
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}

l := xjlab.NewTaskFailureAnalyzeLogic(r.Context(), ctx)
resp, err := l.TaskFailureAnalyze(&req)
result.HttpResult(r, w, resp, err)
}
}

+ 27
- 2
internal/logic/schedule/schedulegetaijoblogloglogic.go View File

@@ -2,6 +2,9 @@ package schedule


import ( import (
"context" "context"
"errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"strconv"


"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
@@ -26,11 +29,33 @@ func NewScheduleGetAiJobLogLogLogic(ctx context.Context, svcCtx *svc.ServiceCont
func (l *ScheduleGetAiJobLogLogLogic) ScheduleGetAiJobLogLog(req *types.AiJobLogReq) (resp *types.AiJobLogResp, err error) { func (l *ScheduleGetAiJobLogLogLogic) ScheduleGetAiJobLogLog(req *types.AiJobLogReq) (resp *types.AiJobLogResp, err error) {
resp = &types.AiJobLogResp{} resp = &types.AiJobLogResp{}


id, err := l.svcCtx.Scheduler.AiStorages.GetAiTaskIdByClusterIdAndTaskId(req.ClusterId, req.TaskId)
taskId, err := strconv.ParseInt(req.TaskId, 10, 64)
if err != nil { if err != nil {
return nil, err return nil, err
} }
log, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].GetTrainingTaskLog(l.ctx, id, req.InstanceNum)
task, err := l.svcCtx.Scheduler.AiStorages.GetTaskById(taskId)
if err != nil {
return nil, err
}

aiTasks, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByTaskId(req.TaskId)
if err != nil {
return nil, err
}

if len(aiTasks) == 0 && task.Status == constants.Failed {
return nil, errors.New("submit failed, no log available")
} else if len(aiTasks) > 1 {
return nil, errors.New("multiple ai task found")
}

aiTask := aiTasks[0]
adapterId := strconv.FormatInt(aiTask.AdapterId, 10)
clusterId := strconv.FormatInt(aiTask.ClusterId, 10)

jobId := aiTask.JobId

log, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapterId][clusterId].GetTrainingTaskLog(l.ctx, jobId, req.InstanceNum)
if err != nil { if err != nil {
return nil, err return nil, err
} }


+ 82
- 0
internal/logic/xjlab/task_analyze.go View File

@@ -0,0 +1,82 @@
package xjlab

import (
"context"
"fmt"

"github.com/pkg/errors"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gorm.io/gorm"
)

type TaskFailureAnalyzeLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
hpcService *service.HpcService
}

func NewTaskFailureAnalyzeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskFailureAnalyzeLogic {
cache := make(map[string]interface{}, 10)
hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
if err != nil {
return nil
}
return &TaskFailureAnalyzeLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
hpcService: hpcService,
}
}

func (l *TaskFailureAnalyzeLogic) TaskFailureAnalyze(req *types.FId) (interface{}, error) {
task := &models.Task{}
var resp interface{}
if errors.Is(l.svcCtx.DbEngin.Where("id", req.Id).First(&task).Error, gorm.ErrRecordNotFound) {
return nil, errors.New("记录不存在")
}
switch task.AdapterTypeDict {
case constants.AdapterTypeCloud:
return nil, nil
case constants.AdapterTypeAI:
return nil, nil
case constants.AdapterTypeHPC:
// 获取HPC任务失败分析
usage, err := l.GetHpcTaskFailureAnalyze(req)
if err != nil {
return nil, err
}
resp = usage
}
return resp, nil
}

func (l *TaskFailureAnalyzeLogic) GetHpcTaskFailureAnalyze(req *types.FId) (resp interface{}, err error) {
var hpcR TaskHPCResult
tx := l.svcCtx.DbEngin.Raw(
"SELECT t.id, hpc.job_id ,hpc.adapter_id ,hpc.cluster_id FROM task t "+
"INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
"WHERE adapter_type_dict = 2 AND t.id = ?",
req.Id,
).Scan(&hpcR).Error
if tx != nil {
return nil, fmt.Errorf("数据库查询失败: %v", tx.Error)
}
if hpcR.ID == 0 {
return nil, fmt.Errorf("任务不存在")
}

// 获取资源使用情况
resp, err = l.hpcService.HpcExecutorAdapterMap[hpcR.AdapterId].GetHpcTaskFailureAnalyze(l.ctx, hpcR.JobID, hpcR.ClusterId)
if err != nil {
return nil, err
}

return resp, nil
}

+ 11
- 0
internal/scheduler/database/aiStorage.go View File

@@ -99,6 +99,17 @@ func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, e
return resp, nil return resp, nil
} }


func (s *AiStorage) GetAiTasksByTaskId(taskId string) ([]*models.TaskAi, error) {
var resp []*models.TaskAi
db := s.DbEngin.Model(&models.TaskAi{}).Table("task_ai")
db = db.Where("task_id = ?", taskId)
err := db.Order("commit_time desc").Find(&resp).Error
if err != nil {
return nil, err
}
return resp, nil
}

func (s *AiStorage) GetAiTaskListById(id int64) ([]*models.TaskAi, error) { func (s *AiStorage) GetAiTaskListById(id int64) ([]*models.TaskAi, error) {
var aiTaskList []*models.TaskAi var aiTaskList []*models.TaskAi
tx := s.DbEngin.Raw("select * from task_ai where `task_id` = ? ", id).Scan(&aiTaskList) tx := s.DbEngin.Raw("select * from task_ai where `task_id` = ? ", id).Scan(&aiTaskList)


+ 1
- 1
internal/scheduler/service/collector/hpc_collector.go View File

@@ -14,7 +14,7 @@ type HPCCollector interface {
CancelTask(ctx context.Context, jobId string, clusterId string) error CancelTask(ctx context.Context, jobId string, clusterId string) error
GetTaskLogs(ctx context.Context, jobId string, clusterId string) (interface{}, error) GetTaskLogs(ctx context.Context, jobId string, clusterId string) (interface{}, error)
GetTaskResourceUsage(ctx context.Context, jobId string, clusterId string) (interface{}, error) GetTaskResourceUsage(ctx context.Context, jobId string, clusterId string) (interface{}, error)
//RescheduleTask(ctx context.Context, jobId string, clusterId string) error
GetHpcTaskFailureAnalyze(ctx context.Context, jobId string, clusterId string) (interface{}, error)
} }


type JobInfo struct { type JobInfo struct {


+ 26
- 0
internal/scheduler/service/hpc/slurm.go View File

@@ -30,6 +30,7 @@ const (
JobLogUrl = "/api/v1/hpc/jobs/logs/{clusterId}/{jobId}" JobLogUrl = "/api/v1/hpc/jobs/logs/{clusterId}/{jobId}"
CancelTaskUrl = "/api/v1/hpc/jobs/cancel/{clusterId}/{jobId}" CancelTaskUrl = "/api/v1/hpc/jobs/cancel/{clusterId}/{jobId}"
JobResourceUsageUrl = "/api/v1/hpc/jobs/resource/usage/{clusterId}/{jobId}" JobResourceUsageUrl = "/api/v1/hpc/jobs/resource/usage/{clusterId}/{jobId}"
JobFailureAnalyze = "/api/v1/hpc/task/analyze"
) )


func NewHpc(host string, id int64, platform string) *ParticipantHpc { func NewHpc(host string, id int64, platform string) *ParticipantHpc {
@@ -205,3 +206,28 @@ func (c *ParticipantHpc) GetTaskResourceUsage(ctx context.Context, jobId string,
} }
return resp, nil return resp, nil
} }

func (c *ParticipantHpc) GetHpcTaskFailureAnalyze(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
logx.WithContext(ctx).Infof("获取超算集群任务失败分析, url: %s, jobId: %s", JobFailureAnalyze, jobId)
if jobId == "" {
return nil, fmt.Errorf("jobId is empty")
}
resp := types.CommonResp{}
_, err := c.Request(JobFailureAnalyze, http.MethodPost, func(req *resty.Request) {
req.SetHeaders(map[string]string{
"Content-Type": "application/json",
"traceId": result.TraceIDFromContext(ctx),
}).SetBody(map[string]string{
"JobId": jobId,
"clusterId": clusterId,
"clusterType": "hpc",
}).SetResult(&resp)
})
if err != nil {
return nil, err
}
if resp.Code != http.StatusOK {
return nil, fmt.Errorf(resp.Msg)
}
return resp, nil
}

+ 1
- 1
internal/storeLink/openi.go View File

@@ -240,7 +240,7 @@ func (o *OpenI) SubmitTask(ctx context.Context, imageId string, cmd string, envs
Description: algorithmId, // temporarily set reponame contained in the algorithmId to desc for missing taskdetail's reponame Description: algorithmId, // temporarily set reponame contained in the algorithmId to desc for missing taskdetail's reponame
JobType: TRAIN, JobType: TRAIN,
Cluster: C2NET, Cluster: C2NET,
DisplayJobName: TRAIN + UNDERSCORE + utils.RandomString(10),
DisplayJobName: strings.ToLower(TRAIN + UNDERSCORE + utils.RandomString(10)),
ComputeSource: computeSource, ComputeSource: computeSource,
SpecId: int(specId), SpecId: int(specId),
BranchName: branchName, BranchName: branchName,


+ 11
- 10
pkg/constants/task.go View File

@@ -16,19 +16,20 @@ package constants


// 任务状态类型 // 任务状态类型
const ( const (
Saved = "Saved"
Running = "Running"
WaitDelete = "WaitDelete" WaitDelete = "WaitDelete"
Deleted = "Deleted" Deleted = "Deleted"
Completed = "Completed"
Succeeded = "Succeeded"
Failed = "Failed"
WaitRestart = "WaitRestart" WaitRestart = "WaitRestart"
WaitPause = "WaitPause" WaitPause = "WaitPause"
WaitStart = "WaitStart" WaitStart = "WaitStart"
Pending = "Pending"
Stopped = "Stopped"
Deploying = "Deploying"
Cancelled = "Cancelled"
Waiting = "Waiting"

Completed = "Completed" //已完成
Succeeded = "Succeeded" //成功
Failed = "Failed" //失败
Saved = "Saved" //已保存
Running = "Running" //运行中
Pending = "Pending" //挂起
Stopped = "Stopped" //已停止
Deploying = "Deploying" //部署中
Cancelled = "Cancelled" //已取消
Waiting = "Waiting" //等待中
) )

Loading…
Cancel
Save