// Package cron contains the periodic jobs that keep the scheduler's in-memory
// AI adapter maps and the persisted cluster resource figures up to date.
package cron

import (
	"errors"
	"fmt"
	"net/http"
	"strconv"
	"sync"

	"github.com/zeromicro/go-zero/core/logx"
	"github.com/zeromicro/go-zero/zrpc"
	hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/config"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
	"gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
	"gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
	"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
)

// Cluster names for which an AI link can be built. They are matched against
// types.ClusterInfo.Name.
const (
	OCTOPUS    = "octopus"
	MODELARTS  = "modelarts"
	SHUGUANGAI = "shuguangAi"
)

// errUnsupportedClusterType is returned by newAiClusterLink when the cluster
// name is none of OCTOPUS, MODELARTS or SHUGUANGAI.
var errUnsupportedClusterType = errors.New("unsupported ai cluster type")

// GetTaskList returns the first page (limit 10, offset 0) of non-deleted rows
// of the "task" table, newest first by created_time.
func GetTaskList(svc *svc.ServiceContext) ([]*types.TaskModel, error) {
	const (
		limit  = 10
		offset = 0
	)

	db := svc.DbEngin.Model(&types.TaskModel{}).Table("task").Where("deleted_at is null")

	// The total is not returned to the caller; the count query is kept so that
	// a failing count still surfaces as an error, as it did before.
	var total int64
	if err := db.Count(&total).Error; err != nil {
		return nil, err
	}

	// Keep the handle returned by the chain instead of relying on the builder
	// mutating db in place.
	db = db.Limit(limit).Offset(offset).Order("created_time desc")

	var list []*types.TaskModel
	if err := db.Find(&list).Error; err != nil {
		return nil, err
	}
	return list, nil
}

// UpdateAiAdapterMaps synchronises the scheduler's executor/collector adapter
// maps with the adapters of type "1" and their clusters as reported by
// AiStorages.
//
// For every adapter that has at least one cluster:
//   - if both maps already hold exactly as many entries as there are
//     clusters, nothing is done;
//   - if neither map knows the adapter, fresh cluster maps are built with
//     InitAiClusterMap;
//   - otherwise the missing clusters are added with UpdateClusterMaps.
//
// A storage error is logged and aborts the whole run.
func UpdateAiAdapterMaps(svc *svc.ServiceContext) {
	const aiType = "1"

	adapterIds, err := svc.Scheduler.AiStorages.GetAdapterIdsByType(aiType)
	if err != nil {
		// Pass err as an argument: using a pre-formatted message as the
		// format string would mangle any '%' contained in the error text.
		logx.Errorf("###UpdateAiAdapterMaps###, error: %v \n", err)
		return
	}

	for _, id := range adapterIds {
		clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(id)
		if err != nil {
			logx.Errorf("###UpdateAiAdapterMaps###, error: %v \n", err)
			return
		}
		if len(clusters.List) == 0 {
			continue
		}

		switch {
		case isAdapterExist(svc, id, len(clusters.List)):
			// Already complete, nothing to do.
		case isAdapterEmpty(svc, id):
			exeClusterMap, colClusterMap := InitAiClusterMap(&svc.Config, clusters.List)
			svc.Scheduler.AiService.AiExecutorAdapterMap[id] = exeClusterMap
			svc.Scheduler.AiService.AiCollectorAdapterMap[id] = colClusterMap
		default:
			UpdateClusterMaps(svc, id, clusters.List)
		}
	}
}

// UpdateClusterMaps adds every cluster of the given adapter that is present in
// neither the executor map nor the collector map to both maps.
//
// Clusters with an unsupported name are skipped silently; clusters whose Id is
// not a base-10 int64 are logged and skipped.
func UpdateClusterMaps(svc *svc.ServiceContext, adapterId string, clusters []types.ClusterInfo) {
	execByAdapter := svc.Scheduler.AiService.AiExecutorAdapterMap
	collByAdapter := svc.Scheduler.AiService.AiCollectorAdapterMap

	for _, c := range clusters {
		_, inExec := execByAdapter[adapterId][c.Id]
		_, inColl := collByAdapter[adapterId][c.Id]
		if inExec || inColl {
			continue
		}

		exec, coll, err := newAiClusterLink(&svc.Config, c)
		if err != nil {
			if !errors.Is(err, errUnsupportedClusterType) {
				logx.Errorf("###UpdateClusterMaps###, cluster %s: %v", c.Id, err)
			}
			continue
		}

		// Only one of the two outer maps may know the adapter; create the
		// missing inner map lazily so the writes below cannot hit a nil map.
		if execByAdapter[adapterId] == nil {
			execByAdapter[adapterId] = make(map[string]executor.AiExecutor)
		}
		if collByAdapter[adapterId] == nil {
			collByAdapter[adapterId] = make(map[string]collector.AiCollector)
		}

		// Register the link under both roles, mirroring InitAiClusterMap.
		execByAdapter[adapterId][c.Id] = exec
		collByAdapter[adapterId][c.Id] = coll
	}
}

// isAdapterExist reports whether both adapter maps contain the adapter id and
// each of them holds exactly clusterNum clusters for it.
func isAdapterExist(svc *svc.ServiceContext, id string, clusterNum int) bool {
	emap, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
	cmap, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
	return ok && ok2 && len(emap) == clusterNum && len(cmap) == clusterNum
}

// isAdapterEmpty reports whether the adapter id is absent from both the
// executor and the collector adapter map.
func isAdapterEmpty(svc *svc.ServiceContext, id string) bool {
	_, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
	_, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
	return !ok && !ok2
}

// InitAiClusterMap builds the executor and collector maps, keyed by cluster
// Id, for all clusters whose name is OCTOPUS, MODELARTS or SHUGUANGAI.
//
// Clusters with any other name are skipped silently; clusters whose Id is not
// a base-10 int64 are logged and skipped.
func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) {
	executorMap := make(map[string]executor.AiExecutor, len(clusters))
	collectorMap := make(map[string]collector.AiCollector, len(clusters))

	for _, c := range clusters {
		exec, coll, err := newAiClusterLink(conf, c)
		if err != nil {
			if !errors.Is(err, errUnsupportedClusterType) {
				logx.Errorf("###InitAiClusterMap###, cluster %s: %v", c.Id, err)
			}
			continue
		}
		collectorMap[c.Id] = coll
		executorMap[c.Id] = exec
	}
	return executorMap, collectorMap
}

// newAiClusterLink creates the store link for one cluster and returns it in
// both of its roles (executor and collector).
//
// It returns errUnsupportedClusterType when c.Name is not a known cluster
// name, and a wrapped strconv error when c.Id cannot be parsed as a base-10
// int64. The RPC clients are created with zrpc.MustNewClient, exactly as the
// callers did before this helper existed.
func newAiClusterLink(conf *config.Config, c types.ClusterInfo) (executor.AiExecutor, collector.AiCollector, error) {
	// Select the constructor first so that unsupported clusters are rejected
	// without parsing their id or dialling any RPC endpoint.
	var build func(id int64) (executor.AiExecutor, collector.AiCollector)
	switch c.Name {
	case OCTOPUS:
		build = func(id int64) (executor.AiExecutor, collector.AiCollector) {
			octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
			octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
			return octopus, octopus
		}
	case MODELARTS:
		build = func(id int64) (executor.AiExecutor, collector.AiCollector) {
			modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
			modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
			modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
			return modelarts, modelarts
		}
	case SHUGUANGAI:
		build = func(id int64) (executor.AiExecutor, collector.AiCollector) {
			aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
			sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
			return sgai, sgai
		}
	default:
		return nil, nil, errUnsupportedClusterType
	}

	id, err := strconv.ParseInt(c.Id, 10, 64)
	if err != nil {
		return nil, nil, fmt.Errorf("parsing cluster id %q: %w", c.Id, err)
	}

	exec, coll := build(id)
	return exec, coll, nil
}

// UpdateClusterResource refreshes the stored resource record of every cluster
// belonging to an adapter of type "1".
//
// One goroutine per cluster queries the cluster's collector; the function
// returns after all of them have finished. Failures are logged and only
// affect the cluster (or adapter) they belong to.
func UpdateClusterResource(svc *svc.ServiceContext) {
	list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1")
	if err != nil {
		logx.Errorf("###UpdateClusterResource###, error: %v", err)
		return
	}

	var wg sync.WaitGroup
	for _, adapter := range list {
		// Copy the fields the goroutines need: before Go 1.22 the loop
		// variable is shared between iterations, and the goroutines may still
		// be running after the loop has moved on to the next adapter.
		adapterID, adapterType := adapter.Id, adapter.Type

		clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapterID)
		if err != nil {
			logx.Errorf("###UpdateClusterResource###, adapter %s: %v", adapterID, err)
			continue
		}

		for _, cluster := range clusters.List {
			c := cluster
			clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
			if err != nil {
				logx.Errorf("###UpdateClusterResource###, cluster %s: %v", c.Id, err)
				continue
			}
			if clusterResource == nil {
				// Nothing to compare against or update; dereferencing below
				// would panic inside the goroutine.
				continue
			}

			wg.Add(1)
			go func() {
				defer wg.Done()
				syncClusterResource(svc, adapterID, adapterType, c, clusterResource)
			}()
		}
	}
	wg.Wait()
}

// syncClusterResource fetches the current resource statistics of one cluster
// from its collector and persists them.
//
// If res equals the zero models.TClusterResource a new record is saved;
// otherwise res is updated in place and written back, unless the reported
// CPU, memory or disk total is zero. Clusters without a registered collector,
// a nil stat result, or an adapterType that is not a base-10 int64 are
// skipped.
func syncClusterResource(svc *svc.ServiceContext, adapterID, adapterType string, c types.ClusterInfo, res *models.TClusterResource) {
	coll, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapterID][c.Id]
	if !ok {
		return
	}

	// Context of a zero-valued request; net/http returns the background
	// context when none has been set.
	ctx := (&http.Request{}).Context()
	stat, err := coll.GetResourceStats(ctx)
	if err != nil {
		logx.Errorf("###UpdateClusterResource###, cluster %s: %v", c.Id, err)
		return
	}
	if stat == nil {
		return
	}

	clusterType, err := strconv.ParseInt(adapterType, 10, 64)
	if err != nil {
		logx.Errorf("###UpdateClusterResource###, adapter %s: %v", adapterID, err)
		return
	}

	// Aggregate the available cards: total count and count-weighted FP16 TOPS.
	var cardTotal int64
	var topsTotal float64
	for _, card := range stat.CardsAvail {
		cardTotal += int64(card.CardNum)
		topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
	}

	if (models.TClusterResource{} == *res) {
		err = svc.Scheduler.AiStorages.SaveClusterResources(adapterID, c.Id, c.Name, clusterType,
			float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
			stat.MemAvail, stat.MemTotal,
			stat.DiskAvail, stat.DiskTotal,
			float64(stat.GpuAvail), float64(stat.GpuTotal),
			cardTotal, topsTotal)
		if err != nil {
			logx.Errorf("###UpdateClusterResource###, cluster %s: %v", c.Id, err)
		}
		return
	}

	// Do not overwrite an existing record when any of the totals is zero.
	if stat.CpuCoreTotal == 0 || stat.MemTotal == 0 || stat.DiskTotal == 0 {
		return
	}

	// NOTE(review): the insert path above also passes stat.GpuAvail and
	// stat.GpuTotal, but this update path never refreshes GPU figures;
	// confirm whether that is intended.
	res.CardTotal = cardTotal
	res.CardTopsTotal = topsTotal
	res.CpuAvail = float64(stat.CpuCoreAvail)
	res.CpuTotal = float64(stat.CpuCoreTotal)
	res.MemAvail = stat.MemAvail
	res.MemTotal = stat.MemTotal
	res.DiskAvail = stat.DiskAvail
	res.DiskTotal = stat.DiskTotal

	if err := svc.Scheduler.AiStorages.UpdateClusterResources(res); err != nil {
		logx.Errorf("###UpdateClusterResource###, cluster %s: %v", c.Id, err)
	}
}