You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

getcenteroverviewlogic.go 3.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. package ai
  2. import (
  3. "context"
  4. "github.com/zeromicro/go-zero/core/logx"
  5. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  8. "strconv"
  9. "sync"
  10. )
  11. type GetCenterOverviewLogic struct {
  12. logx.Logger
  13. ctx context.Context
  14. svcCtx *svc.ServiceContext
  15. }
  16. func NewGetCenterOverviewLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetCenterOverviewLogic {
  17. return &GetCenterOverviewLogic{
  18. Logger: logx.WithContext(ctx),
  19. ctx: ctx,
  20. svcCtx: svcCtx,
  21. }
  22. }
  23. func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverviewResp, err error) {
  24. resp = &types.CenterOverviewResp{}
  25. var mu sync.RWMutex
  26. ch := make(chan struct{})
  27. var centerNum int32
  28. var taskNum int32
  29. var cardNum int32
  30. var totalTops float64
  31. adapterList, err := l.svcCtx.Scheduler.AiStorages.GetAdaptersByType("1")
  32. if err != nil {
  33. return nil, err
  34. }
  35. centerNum = int32(len(adapterList))
  36. resp.CenterNum = centerNum
  37. go l.updateClusterResource(&mu, ch, adapterList)
  38. for _, adapter := range adapterList {
  39. taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id)
  40. if err != nil {
  41. continue
  42. }
  43. taskNum += int32(len(taskList))
  44. }
  45. resp.TaskNum = taskNum
  46. for _, adapter := range adapterList {
  47. clusters, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
  48. if err != nil {
  49. continue
  50. }
  51. for _, cluster := range clusters.List {
  52. mu.RLock()
  53. clusterResource, err := l.svcCtx.Scheduler.AiStorages.GetClusterResourcesById(cluster.Id)
  54. mu.RUnlock()
  55. if err != nil {
  56. continue
  57. }
  58. cardNum += int32(clusterResource.CardTotal)
  59. totalTops += clusterResource.CardTopsTotal
  60. }
  61. }
  62. resp.CardNum = cardNum
  63. resp.PowerInTops = totalTops
  64. <-ch
  65. return resp, nil
  66. }
  67. func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {
  68. var wg sync.WaitGroup
  69. for _, adapter := range list {
  70. clusters, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
  71. if err != nil {
  72. continue
  73. }
  74. for _, cluster := range clusters.List {
  75. c := cluster
  76. mu.RLock()
  77. clusterResource, err := l.svcCtx.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
  78. mu.RUnlock()
  79. if err != nil {
  80. continue
  81. }
  82. wg.Add(1)
  83. go func() {
  84. stat, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(l.ctx)
  85. if err != nil {
  86. wg.Done()
  87. return
  88. }
  89. clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
  90. if err != nil {
  91. wg.Done()
  92. return
  93. }
  94. var cardTotal int64
  95. var topsTotal float64
  96. for _, card := range stat.CardsAvail {
  97. cardTotal += int64(card.CardNum)
  98. topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
  99. }
  100. mu.Lock()
  101. if (models.TClusterResource{} == *clusterResource) {
  102. err = l.svcCtx.Scheduler.AiStorages.SaveClusterResources(c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
  103. stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
  104. if err != nil {
  105. mu.Unlock()
  106. wg.Done()
  107. return
  108. }
  109. } else {
  110. clusterResource.CardTotal = cardTotal
  111. clusterResource.CardTopsTotal = topsTotal
  112. err := l.svcCtx.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
  113. if err != nil {
  114. mu.Unlock()
  115. wg.Done()
  116. return
  117. }
  118. }
  119. mu.Unlock()
  120. wg.Done()
  121. }()
  122. }
  123. }
  124. wg.Wait()
  125. ch <- struct{}{}
  126. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.