|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- package xjlab
-
- import (
- "context"
- "fmt"
-
- "github.com/pkg/errors"
- "github.com/zeromicro/go-zero/core/logx"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
- "gorm.io/gorm"
- )
-
- type TaskFailureAnalyzeLogic struct {
- logx.Logger
- ctx context.Context
- svcCtx *svc.ServiceContext
- hpcService *service.HpcService
- }
-
- func NewTaskFailureAnalyzeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskFailureAnalyzeLogic {
- cache := make(map[string]interface{}, 10)
- hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
- if err != nil {
- return nil
- }
- return &TaskFailureAnalyzeLogic{
- Logger: logx.WithContext(ctx),
- ctx: ctx,
- svcCtx: svcCtx,
- hpcService: hpcService,
- }
- }
-
- func (l *TaskFailureAnalyzeLogic) TaskFailureAnalyze(req *types.FId) (interface{}, error) {
- task := &models.Task{}
- var resp interface{}
- if errors.Is(l.svcCtx.DbEngin.Where("id", req.Id).First(&task).Error, gorm.ErrRecordNotFound) {
- return nil, errors.New("记录不存在")
- }
- switch task.AdapterTypeDict {
- case constants.AdapterTypeCloud:
- return nil, nil
- case constants.AdapterTypeAI:
- return nil, nil
- case constants.AdapterTypeHPC:
- // 获取HPC任务失败分析
- usage, err := l.GetHpcTaskFailureAnalyze(req)
- if err != nil {
- return nil, err
- }
- resp = usage
- }
- return resp, nil
- }
-
- func (l *TaskFailureAnalyzeLogic) GetHpcTaskFailureAnalyze(req *types.FId) (resp interface{}, err error) {
- var hpcR TaskHPCResult
- tx := l.svcCtx.DbEngin.Raw(
- "SELECT t.id, hpc.job_id ,hpc.adapter_id ,hpc.cluster_id FROM task t "+
- "INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
- "WHERE adapter_type_dict = 2 AND t.id = ?",
- req.Id,
- ).Scan(&hpcR).Error
- if tx != nil {
- return nil, fmt.Errorf("数据库查询失败: %v", tx.Error)
- }
- if hpcR.ID == 0 {
- return nil, fmt.Errorf("任务不存在")
- }
-
- // 获取资源使用情况
- resp, err = l.hpcService.HpcExecutorAdapterMap[hpcR.AdapterId].GetHpcTaskFailureAnalyze(l.ctx, hpcR.JobID, hpcR.ClusterId)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
|