| @@ -327,3 +327,79 @@ func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[st | |||
| return executorMap, collectorMap | |||
| } | |||
| func UpdateClusterResource(svc *svc.ServiceContext) { | |||
| list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1") | |||
| if err != nil { | |||
| return | |||
| } | |||
| var wg sync.WaitGroup | |||
| for _, adapter := range list { | |||
| clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id) | |||
| if err != nil { | |||
| continue | |||
| } | |||
| for _, cluster := range clusters.List { | |||
| c := cluster | |||
| clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id) | |||
| if err != nil { | |||
| continue | |||
| } | |||
| wg.Add(1) | |||
| go func() { | |||
| _, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id] | |||
| if !ok { | |||
| wg.Done() | |||
| return | |||
| } | |||
| h := http.Request{} | |||
| stat, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(h.Context()) | |||
| if err != nil { | |||
| wg.Done() | |||
| return | |||
| } | |||
| if stat == nil { | |||
| wg.Done() | |||
| return | |||
| } | |||
| clusterType, err := strconv.ParseInt(adapter.Type, 10, 64) | |||
| if err != nil { | |||
| wg.Done() | |||
| return | |||
| } | |||
| var cardTotal int64 | |||
| var topsTotal float64 | |||
| for _, card := range stat.CardsAvail { | |||
| cardTotal += int64(card.CardNum) | |||
| topsTotal += card.TOpsAtFp16 * float64(card.CardNum) | |||
| } | |||
| if (models.TClusterResource{} == *clusterResource) { | |||
| err = svc.Scheduler.AiStorages.SaveClusterResources(c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal), | |||
| stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal) | |||
| if err != nil { | |||
| wg.Done() | |||
| return | |||
| } | |||
| } else { | |||
| clusterResource.CardTotal = cardTotal | |||
| clusterResource.CardTopsTotal = topsTotal | |||
| clusterResource.CpuAvail = float64(stat.CpuCoreAvail) | |||
| clusterResource.CpuTotal = float64(stat.CpuCoreTotal) | |||
| clusterResource.MemAvail = stat.MemAvail | |||
| clusterResource.MemTotal = stat.MemTotal | |||
| clusterResource.DiskAvail = stat.DiskAvail | |||
| clusterResource.DiskTotal = stat.DiskTotal | |||
| err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource) | |||
| if err != nil { | |||
| wg.Done() | |||
| return | |||
| } | |||
| } | |||
| wg.Done() | |||
| }() | |||
| } | |||
| } | |||
| wg.Wait() | |||
| } | |||
| @@ -44,4 +44,7 @@ func AddCronGroup(svc *svc.ServiceContext) { | |||
| UpdateAiAdapterMaps(svc) | |||
| }) | |||
| svc.Cron.AddFunc("30 21 * * *", func() { | |||
| UpdateClusterResource(svc) | |||
| }) | |||
| } | |||
| @@ -137,6 +137,13 @@ func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan | |||
| } else { | |||
| clusterResource.CardTotal = cardTotal | |||
| clusterResource.CardTopsTotal = topsTotal | |||
| clusterResource.CpuAvail = float64(stat.CpuCoreAvail) | |||
| clusterResource.CpuTotal = float64(stat.CpuCoreTotal) | |||
| clusterResource.MemAvail = stat.MemAvail | |||
| clusterResource.MemTotal = stat.MemTotal | |||
| clusterResource.DiskAvail = stat.DiskAvail | |||
| clusterResource.DiskTotal = stat.DiskTotal | |||
| err := l.svcCtx.Scheduler.AiStorages.UpdateClusterResources(clusterResource) | |||
| if err != nil { | |||
| mu.Unlock() | |||
| @@ -2,12 +2,11 @@ package core | |||
| import ( | |||
| "context" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker" | |||
| "strconv" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | |||
| ) | |||
| type SyncClusterLoadLogic struct { | |||
| @@ -25,24 +24,10 @@ func NewSyncClusterLoadLogic(ctx context.Context, svcCtx *svc.ServiceContext) *S | |||
| } | |||
| func (l *SyncClusterLoadLogic) SyncClusterLoad(req *types.SyncClusterLoadReq) error { | |||
| if len(req.ClusterLoadRecords) != 0 { | |||
| for _, record := range req.ClusterLoadRecords { | |||
| tracker.ClusterCpuUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuUtilisation) | |||
| tracker.ClusterCpuAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuAvail) | |||
| tracker.ClusterCpuTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuTotal) | |||
| tracker.ClusterMemoryUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryUtilisation) | |||
| tracker.ClusterMemoryAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryAvail) | |||
| tracker.ClusterMemoryTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryTotal) | |||
| tracker.ClusterDiskUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskUtilisation) | |||
| tracker.ClusterDiskAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskAvail) | |||
| tracker.ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal) | |||
| tracker.ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation) | |||
| tracker.ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount)) | |||
| tracker.ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal)) | |||
| } | |||
| if nil != req.ClusterLoadRecords { | |||
| var param tracker.ClusterLoadRecord | |||
| tool.Convert(req, ¶m) | |||
| tracker.SyncClusterLoad(param) | |||
| } | |||
| return nil | |||
| } | |||
| @@ -6,6 +6,7 @@ import ( | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker" | |||
| "gorm.io/gorm" | |||
| "strconv" | |||
| "time" | |||
| @@ -211,6 +212,17 @@ func (s *AiStorage) SaveClusterResources(clusterId string, clusterName string, c | |||
| if tx.Error != nil { | |||
| return tx.Error | |||
| } | |||
| // prometheus | |||
| param := tracker.ClusterLoadRecord{ | |||
| ClusterName: clusterName, | |||
| CpuAvail: cpuAvail, | |||
| CpuTotal: cpuTotal, | |||
| MemoryAvail: memAvail, | |||
| MemoryTotal: memTotal, | |||
| DiskAvail: diskAvail, | |||
| DiskTotal: diskTotal, | |||
| } | |||
| tracker.SyncClusterLoad(param) | |||
| return nil | |||
| } | |||
| @@ -219,6 +231,17 @@ func (s *AiStorage) UpdateClusterResources(clusterResource *models.TClusterResou | |||
| if tx.Error != nil { | |||
| return tx.Error | |||
| } | |||
| // prometheus | |||
| param := tracker.ClusterLoadRecord{ | |||
| ClusterName: clusterResource.ClusterName, | |||
| CpuAvail: clusterResource.CpuAvail, | |||
| CpuTotal: clusterResource.CpuTotal, | |||
| MemoryAvail: clusterResource.MemAvail, | |||
| MemoryTotal: clusterResource.MemTotal, | |||
| DiskAvail: clusterResource.DiskAvail, | |||
| DiskTotal: clusterResource.DiskTotal, | |||
| } | |||
| tracker.SyncClusterLoad(param) | |||
| return nil | |||
| } | |||
| @@ -107,6 +107,23 @@ var ( | |||
| } | |||
| ) | |||
// ClusterLoadRecord is one load sample for a single cluster.
//
// SyncClusterLoad uses ClusterName and AdapterId as gauge label values and
// writes every other field to its corresponding gauge.
type ClusterLoadRecord struct {
	// Identity of the sample; used as label values.
	AdapterId   int64  `json:"adapterId,optional"`
	ClusterName string `json:"clusterName,optional"`

	// CPU figures.
	CpuAvail       float64 `json:"cpuAvail,optional"`
	CpuTotal       float64 `json:"cpuTotal,optional"`
	CpuUtilisation float64 `json:"cpuUtilisation,optional"`

	// Memory figures.
	MemoryAvail       float64 `json:"memoryAvail,optional"`
	MemoryUtilisation float64 `json:"memoryUtilisation,optional"`
	MemoryTotal       float64 `json:"memoryTotal,optional"`

	// Disk figures.
	DiskAvail       float64 `json:"diskAvail,optional"`
	DiskTotal       float64 `json:"diskTotal,optional"`
	DiskUtilisation float64 `json:"diskUtilisation,optional"`

	// Pod figures.
	PodsUtilisation float64 `json:"podsUtilisation,optional"`
	PodsCount       int64   `json:"podsCount,optional"`
	PodsTotal       int64   `json:"podsTotal,optional"`
}
| func init() { | |||
| prometheus.MustRegister(metrics...) | |||
| } | |||
| @@ -302,3 +319,21 @@ func (p Prometheus) GetRawData(expr string, o QueryOption) (model.Value, error) | |||
| } | |||
| return value, nil | |||
| } | |||
| func SyncClusterLoad(record ClusterLoadRecord) { | |||
| ClusterCpuUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuUtilisation) | |||
| ClusterCpuAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuAvail) | |||
| ClusterCpuTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuTotal) | |||
| ClusterMemoryUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryUtilisation) | |||
| ClusterMemoryAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryAvail) | |||
| ClusterMemoryTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryTotal) | |||
| ClusterDiskUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskUtilisation) | |||
| ClusterDiskAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskAvail) | |||
| ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal) | |||
| ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation) | |||
| ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount)) | |||
| ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal)) | |||
| } | |||