| @@ -2,7 +2,6 @@ module gitlink.org.cn/JointCloud/pcm-coordinator | |||||
| go 1.22.0 | go 1.22.0 | ||||
| require ( | require ( | ||||
| github.com/JCCE-nudt/apigw-go-sdk v0.0.0-20230525025609-34159d6f2818 | github.com/JCCE-nudt/apigw-go-sdk v0.0.0-20230525025609-34159d6f2818 | ||||
| github.com/Masterminds/squirrel v1.5.4 | github.com/Masterminds/squirrel v1.5.4 | ||||
| @@ -19,7 +18,7 @@ require ( | |||||
| github.com/prometheus/common v0.54.0 | github.com/prometheus/common v0.54.0 | ||||
| github.com/robfig/cron/v3 v3.0.1 | github.com/robfig/cron/v3 v3.0.1 | ||||
| github.com/zeromicro/go-zero v1.6.5 | github.com/zeromicro/go-zero v1.6.5 | ||||
| gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240619113316-c0186ee7b60c | |||||
| gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1 | |||||
| gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe | gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe | ||||
| gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 | gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 | ||||
| gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 | gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 | ||||
| @@ -244,7 +244,8 @@ func (s *AiStorage) GetClusterResourcesById(clusterId string) (*models.TClusterR | |||||
| } | } | ||||
| func (s *AiStorage) SaveClusterResources(adapterId string, clusterId string, clusterName string, clusterType int64, cpuAvail float64, cpuTotal float64, | func (s *AiStorage) SaveClusterResources(adapterId string, clusterId string, clusterName string, clusterType int64, cpuAvail float64, cpuTotal float64, | ||||
| memAvail float64, memTotal float64, diskAvail float64, diskTotal float64, gpuAvail float64, gpuTotal float64, cardTotal int64, topsTotal float64) error { | |||||
| memAvail float64, memTotal float64, diskAvail float64, diskTotal float64, gpuAvail float64, gpuTotal float64, cardTotal int64, topsTotal float64, cardHours float64, | |||||
| balance float64, taskCompleted int64) error { | |||||
| cId, err := strconv.ParseInt(clusterId, 10, 64) | cId, err := strconv.ParseInt(clusterId, 10, 64) | ||||
| if err != nil { | if err != nil { | ||||
| return err | return err | ||||
| @@ -268,6 +269,9 @@ func (s *AiStorage) SaveClusterResources(adapterId string, clusterId string, clu | |||||
| GpuTotal: gpuTotal, | GpuTotal: gpuTotal, | ||||
| CardTotal: cardTotal, | CardTotal: cardTotal, | ||||
| CardTopsTotal: topsTotal, | CardTopsTotal: topsTotal, | ||||
| CardHours: cardHours, | |||||
| Balance: balance, | |||||
| TaskCompleted: taskCompleted, | |||||
| } | } | ||||
| tx := s.DbEngin.Create(&clusterResource) | tx := s.DbEngin.Create(&clusterResource) | ||||
| if tx.Error != nil { | if tx.Error != nil { | ||||
| @@ -17,19 +17,20 @@ type AiCollector interface { | |||||
| } | } | ||||
| type ResourceStats struct { | type ResourceStats struct { | ||||
| ClusterId string | |||||
| Name string | |||||
| CpuCoreAvail int64 | |||||
| CpuCoreTotal int64 | |||||
| MemAvail float64 | |||||
| MemTotal float64 | |||||
| DiskAvail float64 | |||||
| DiskTotal float64 | |||||
| GpuAvail int64 | |||||
| GpuTotal int64 | |||||
| CardsAvail []*Card | |||||
| CpuCoreHours float64 | |||||
| Balance float64 | |||||
| ClusterId string | |||||
| Name string | |||||
| CpuCoreAvail int64 | |||||
| CpuCoreTotal int64 | |||||
| MemAvail float64 | |||||
| MemTotal float64 | |||||
| DiskAvail float64 | |||||
| DiskTotal float64 | |||||
| GpuAvail int64 | |||||
| GpuTotal int64 | |||||
| CardsAvail []*Card | |||||
| CpuCoreHours float64 | |||||
| Balance float64 | |||||
| TaskCompleted int64 | |||||
| } | } | ||||
| type Card struct { | type Card struct { | ||||
| @@ -46,14 +46,17 @@ func UpdateClusterResources(svc *svc.ServiceContext, list []*types.AdapterInfo) | |||||
| } | } | ||||
| var cardTotal int64 | var cardTotal int64 | ||||
| var topsTotal float64 | var topsTotal float64 | ||||
| var cardHours float64 | |||||
| for _, card := range stat.CardsAvail { | for _, card := range stat.CardsAvail { | ||||
| cardTotal += int64(card.CardNum) | cardTotal += int64(card.CardNum) | ||||
| topsTotal += card.TOpsAtFp16 * float64(card.CardNum) | topsTotal += card.TOpsAtFp16 * float64(card.CardNum) | ||||
| cardHours += card.CardHours | |||||
| } | } | ||||
| if (models.TClusterResource{} == *clusterResource) { | if (models.TClusterResource{} == *clusterResource) { | ||||
| err = svc.Scheduler.AiStorages.SaveClusterResources(adapter.Id, c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal), | err = svc.Scheduler.AiStorages.SaveClusterResources(adapter.Id, c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal), | ||||
| stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal) | |||||
| stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal, cardHours, | |||||
| stat.Balance, stat.TaskCompleted) | |||||
| if err != nil { | if err != nil { | ||||
| wg.Done() | wg.Done() | ||||
| return | return | ||||
| @@ -71,6 +74,9 @@ func UpdateClusterResources(svc *svc.ServiceContext, list []*types.AdapterInfo) | |||||
| clusterResource.MemTotal = stat.MemTotal | clusterResource.MemTotal = stat.MemTotal | ||||
| clusterResource.DiskAvail = stat.DiskAvail | clusterResource.DiskAvail = stat.DiskAvail | ||||
| clusterResource.DiskTotal = stat.DiskTotal | clusterResource.DiskTotal = stat.DiskTotal | ||||
| clusterResource.CardHours = cardHours | |||||
| clusterResource.Balance = stat.Balance | |||||
| clusterResource.TaskCompleted = stat.TaskCompleted | |||||
| err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource) | err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -49,6 +49,7 @@ const ( | |||||
| CPUCOREPRICEPERHOUR = 0.09 | CPUCOREPRICEPERHOUR = 0.09 | ||||
| DCUPRICEPERHOUR = 2.0 | DCUPRICEPERHOUR = 2.0 | ||||
| KB = 1024 | KB = 1024 | ||||
| TIMEOUT = 20 | |||||
| ) | ) | ||||
| var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ | var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ | ||||
| @@ -270,7 +271,7 @@ func (s *ShuguangAi) QuerySpecs(ctx context.Context) (interface{}, error) { | |||||
| func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) { | func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) { | ||||
| var wg sync.WaitGroup | var wg sync.WaitGroup | ||||
| wg.Add(4) | |||||
| wg.Add(5) | |||||
| var cBalance = make(chan float64) | var cBalance = make(chan float64) | ||||
| var cMemTotal = make(chan float64) | var cMemTotal = make(chan float64) | ||||
| var cTotalCpu = make(chan int64) | var cTotalCpu = make(chan int64) | ||||
| @@ -287,6 +288,26 @@ func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceS | |||||
| TOpsAtFp16: DCU_TOPS, | TOpsAtFp16: DCU_TOPS, | ||||
| } | } | ||||
| //history jobs | |||||
| go func() { | |||||
| hReq := &hpcAC.ListHistoryJobReq{} | |||||
| hReq.Start = 0 | |||||
| hReq.Limit = 1 | |||||
| hReq.IsQueryByQueueTime = "false" | |||||
| hReq.TimeType = "CUSTOM" | |||||
| hReq.StartTime = "2024-01-01 01:01:01" | |||||
| endTime := time.Now().Format("2006-01-02 15:04:05") | |||||
| hReq.EndTime = endTime | |||||
| hResp, err := s.aCRpc.ListHistoryJob(ctx, hReq) | |||||
| if err != nil || hResp.Code != "0" { | |||||
| wg.Done() | |||||
| return | |||||
| } | |||||
| resourceStats.TaskCompleted = int64(hResp.Data.Total) | |||||
| wg.Done() | |||||
| }() | |||||
| //balance | //balance | ||||
| go func() { | go func() { | ||||
| userReq := &hpcAC.GetUserInfoReq{} | userReq := &hpcAC.GetUserInfoReq{} | ||||
| @@ -304,7 +325,7 @@ func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceS | |||||
| go func() { | go func() { | ||||
| limitReq := &hpcAC.QueueReq{} | limitReq := &hpcAC.QueueReq{} | ||||
| limitResp, err := s.aCRpc.QueryUserQuotasLimit(ctx, limitReq) | limitResp, err := s.aCRpc.QueryUserQuotasLimit(ctx, limitReq) | ||||
| if err != nil { | |||||
| if err != nil || limitResp.Code != "0" { | |||||
| wg.Done() | wg.Done() | ||||
| return | return | ||||
| } | } | ||||
| @@ -351,8 +372,22 @@ func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceS | |||||
| //resources being occupied | //resources being occupied | ||||
| go func() { | go func() { | ||||
| memSize := <-cMemTotal | |||||
| totalCpu := <-cTotalCpu | |||||
| var memSize float64 | |||||
| var totalCpu int64 | |||||
| select { | |||||
| case v := <-cMemTotal: | |||||
| memSize = v | |||||
| case <-time.After(TIMEOUT * time.Second): | |||||
| wg.Done() | |||||
| return | |||||
| } | |||||
| select { | |||||
| case v := <-cTotalCpu: | |||||
| totalCpu = v | |||||
| case <-time.After(TIMEOUT * time.Second): | |||||
| wg.Done() | |||||
| return | |||||
| } | |||||
| memberJobResp, err := s.aCRpc.GetMemberJobs(ctx, nil) | memberJobResp, err := s.aCRpc.GetMemberJobs(ctx, nil) | ||||
| if err != nil { | if err != nil { | ||||
| wg.Done() | wg.Done() | ||||
| @@ -392,7 +427,7 @@ func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceS | |||||
| select { | select { | ||||
| case v := <-cBalance: | case v := <-cBalance: | ||||
| balance = v | balance = v | ||||
| case <-time.After(2 * time.Second): | |||||
| case <-time.After(TIMEOUT * time.Second): | |||||
| return nil, errors.New("get balance rpc call failed") | return nil, errors.New("get balance rpc call failed") | ||||
| } | } | ||||
| @@ -402,6 +437,7 @@ func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceS | |||||
| dcu.CardHours = cardHours | dcu.CardHours = cardHours | ||||
| resourceStats.CpuCoreHours = cpuHours | resourceStats.CpuCoreHours = cpuHours | ||||
| resourceStats.Balance = balance | |||||
| wg.Wait() | wg.Wait() | ||||
| @@ -49,6 +49,9 @@ type ( | |||||
| CardTotal int64 `db:"card_total"` // 算力卡数量 | CardTotal int64 `db:"card_total"` // 算力卡数量 | ||||
| CardTopsTotal float64 `db:"card_tops_total"` // 算力总量tops | CardTopsTotal float64 `db:"card_tops_total"` // 算力总量tops | ||||
| AdapterId int64 `db:"adapter_id"` | AdapterId int64 `db:"adapter_id"` | ||||
| CardHours float64 `db:"card_hours"` | |||||
| Balance float64 `db:"balance"` | |||||
| TaskCompleted int64 `db:"task_completed"` | |||||
| } | } | ||||
| ) | ) | ||||