You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

task_analyze.go 2.3 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. package xjlab
  2. import (
  3. "context"
  4. "fmt"
  5. "github.com/pkg/errors"
  6. "github.com/zeromicro/go-zero/core/logx"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  12. "gorm.io/gorm"
  13. )
  14. type TaskFailureAnalyzeLogic struct {
  15. logx.Logger
  16. ctx context.Context
  17. svcCtx *svc.ServiceContext
  18. hpcService *service.HpcService
  19. }
  20. func NewTaskFailureAnalyzeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskFailureAnalyzeLogic {
  21. cache := make(map[string]interface{}, 10)
  22. hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
  23. if err != nil {
  24. return nil
  25. }
  26. return &TaskFailureAnalyzeLogic{
  27. Logger: logx.WithContext(ctx),
  28. ctx: ctx,
  29. svcCtx: svcCtx,
  30. hpcService: hpcService,
  31. }
  32. }
  33. func (l *TaskFailureAnalyzeLogic) TaskFailureAnalyze(req *types.FId) (interface{}, error) {
  34. task := &models.Task{}
  35. var resp interface{}
  36. if errors.Is(l.svcCtx.DbEngin.Where("id", req.Id).First(&task).Error, gorm.ErrRecordNotFound) {
  37. return nil, errors.New("记录不存在")
  38. }
  39. switch task.AdapterTypeDict {
  40. case constants.AdapterTypeCloud:
  41. return nil, nil
  42. case constants.AdapterTypeAI:
  43. return nil, nil
  44. case constants.AdapterTypeHPC:
  45. // 获取HPC任务失败分析
  46. usage, err := l.GetHpcTaskFailureAnalyze(req)
  47. if err != nil {
  48. return nil, err
  49. }
  50. resp = usage
  51. }
  52. return resp, nil
  53. }
  54. func (l *TaskFailureAnalyzeLogic) GetHpcTaskFailureAnalyze(req *types.FId) (resp interface{}, err error) {
  55. var hpcR TaskHPCResult
  56. tx := l.svcCtx.DbEngin.Raw(
  57. "SELECT t.id, hpc.job_id ,hpc.adapter_id ,hpc.cluster_id FROM task t "+
  58. "INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
  59. "WHERE adapter_type_dict = 2 AND t.id = ?",
  60. req.Id,
  61. ).Scan(&hpcR).Error
  62. if tx != nil {
  63. return nil, fmt.Errorf("数据库查询失败: %v", tx.Error)
  64. }
  65. if hpcR.ID == 0 {
  66. return nil, fmt.Errorf("任务不存在")
  67. }
  68. // 获取资源使用情况
  69. resp, err = l.hpcService.HpcExecutorAdapterMap[hpcR.AdapterId].GetHpcTaskFailureAnalyze(l.ctx, hpcR.JobID, hpcR.ClusterId)
  70. if err != nil {
  71. return nil, err
  72. }
  73. return resp, nil
  74. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.