package xjlab import ( "context" "fmt" "github.com/pkg/errors" "github.com/zeromicro/go-zero/core/logx" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gorm.io/gorm" ) type TaskFailureAnalyzeLogic struct { logx.Logger ctx context.Context svcCtx *svc.ServiceContext hpcService *service.HpcService } func NewTaskFailureAnalyzeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskFailureAnalyzeLogic { cache := make(map[string]interface{}, 10) hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache) if err != nil { return nil } return &TaskFailureAnalyzeLogic{ Logger: logx.WithContext(ctx), ctx: ctx, svcCtx: svcCtx, hpcService: hpcService, } } func (l *TaskFailureAnalyzeLogic) TaskFailureAnalyze(req *types.FId) (interface{}, error) { task := &models.Task{} var resp interface{} if errors.Is(l.svcCtx.DbEngin.Where("id", req.Id).First(&task).Error, gorm.ErrRecordNotFound) { return nil, errors.New("记录不存在") } switch task.AdapterTypeDict { case constants.AdapterTypeCloud: return nil, nil case constants.AdapterTypeAI: return nil, nil case constants.AdapterTypeHPC: // 获取HPC任务失败分析 usage, err := l.GetHpcTaskFailureAnalyze(req) if err != nil { return nil, err } resp = usage } return resp, nil } func (l *TaskFailureAnalyzeLogic) GetHpcTaskFailureAnalyze(req *types.FId) (resp interface{}, err error) { var hpcR TaskHPCResult tx := l.svcCtx.DbEngin.Raw( "SELECT t.id, hpc.job_id ,hpc.adapter_id ,hpc.cluster_id FROM task t "+ "INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+ "WHERE adapter_type_dict = 2 AND t.id = ?", req.Id, ).Scan(&hpcR).Error if tx != nil { return nil, fmt.Errorf("数据库查询失败: %v", tx.Error) } if hpcR.ID == 0 { return nil, fmt.Errorf("任务不存在") } // 获取资源使用情况 resp, err = l.hpcService.HpcExecutorAdapterMap[hpcR.AdapterId].GetHpcTaskFailureAnalyze(l.ctx, hpcR.JobID, hpcR.ClusterId) if err != nil { return nil, err } return resp, nil }