|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 |
- package cron
-
- import (
- "errors"
- "fmt"
- "github.com/zeromicro/go-zero/core/logx"
- "github.com/zeromicro/go-zero/zrpc"
- hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
- "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/config"
- "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
- "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor"
- "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
- "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
- "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
- "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
- "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
- "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
- "net/http"
- "strconv"
- "sync"
- )
-
// Cluster names recognised by this package. Each value is compared against
// ClusterInfo.Name to decide which platform client is built for a cluster.
const (
	// OCTOPUS selects the Octopus client.
	OCTOPUS = "octopus"
	// MODELARTS selects the ModelArts client.
	MODELARTS = "modelarts"
	// SHUGUANGAI selects the Shuguang AI client.
	SHUGUANGAI = "shuguangAi"
)
-
- func GetTaskList(svc *svc.ServiceContext) ([]*types.TaskModel, error) {
- limit := 10
- offset := 0
- var list []*types.TaskModel
- db := svc.DbEngin.Model(&types.TaskModel{}).Table("task")
-
- db = db.Where("deleted_at is null")
-
- //count total
- var total int64
- err := db.Count(&total).Error
- db.Limit(limit).Offset(offset)
-
- if err != nil {
- return nil, err
- }
- err = db.Order("created_time desc").Find(&list).Error
- if err != nil {
- return nil, err
- }
- return list, nil
- }
-
- func UpdateAiAdapterMaps(svc *svc.ServiceContext) {
- var aiType = "1"
- adapterIds, err := svc.Scheduler.AiStorages.GetAdapterIdsByType(aiType)
- if err != nil {
- msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
- logx.Errorf(errors.New(msg).Error())
- return
- }
- if len(adapterIds) == 0 {
- return
- }
-
- for _, id := range adapterIds {
- clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(id)
- if err != nil {
- msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
- logx.Errorf(errors.New(msg).Error())
- return
- }
- if len(clusters.List) == 0 {
- continue
- }
- if isAdapterExist(svc, id, len(clusters.List)) {
- continue
- } else {
- if isAdapterEmpty(svc, id) {
- exeClusterMap, colClusterMap := InitAiClusterMap(&svc.Config, clusters.List)
- svc.Scheduler.AiService.AiExecutorAdapterMap[id] = exeClusterMap
- svc.Scheduler.AiService.AiCollectorAdapterMap[id] = colClusterMap
- } else {
- UpdateClusterMaps(svc, id, clusters.List)
- }
- }
- }
- }
-
- func UpdateClusterMaps(svc *svc.ServiceContext, adapterId string, clusters []types.ClusterInfo) {
- for _, c := range clusters {
- _, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id]
- _, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[adapterId][c.Id]
- if !ok && !ok2 {
- switch c.Name {
- case OCTOPUS:
- id, _ := strconv.ParseInt(c.Id, 10, 64)
- octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(svc.Config.OctopusRpcConf))
- octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
- svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
- svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
- case MODELARTS:
- id, _ := strconv.ParseInt(c.Id, 10, 64)
- modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(svc.Config.ModelArtsRpcConf))
- modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(svc.Config.ModelArtsImgRpcConf))
- modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
- svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
- svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
- case SHUGUANGAI:
- id, _ := strconv.ParseInt(c.Id, 10, 64)
- aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(svc.Config.ACRpcConf))
- sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
- svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
- svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
- }
- } else {
- continue
- }
- }
-
- }
-
- func isAdapterExist(svc *svc.ServiceContext, id string, clusterNum int) bool {
- emap, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
- cmap, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
- if ok && ok2 {
- if len(emap) == clusterNum && len(cmap) == clusterNum {
- return true
- }
- }
- return false
- }
-
- func isAdapterEmpty(svc *svc.ServiceContext, id string) bool {
- _, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
- _, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
- if !ok && !ok2 {
- return true
- }
- return false
- }
-
- func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) {
- executorMap := make(map[string]executor.AiExecutor)
- collectorMap := make(map[string]collector.AiCollector)
- for _, c := range clusters {
- switch c.Name {
- case OCTOPUS:
- id, _ := strconv.ParseInt(c.Id, 10, 64)
- octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
- octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
- collectorMap[c.Id] = octopus
- executorMap[c.Id] = octopus
- case MODELARTS:
- id, _ := strconv.ParseInt(c.Id, 10, 64)
- modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
- modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
- modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
- collectorMap[c.Id] = modelarts
- executorMap[c.Id] = modelarts
- case SHUGUANGAI:
- id, _ := strconv.ParseInt(c.Id, 10, 64)
- aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
- sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
- collectorMap[c.Id] = sgai
- executorMap[c.Id] = sgai
- }
- }
-
- return executorMap, collectorMap
- }
-
- func UpdateClusterResource(svc *svc.ServiceContext) {
- list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1")
- if err != nil {
- return
- }
- var wg sync.WaitGroup
- for _, adapter := range list {
- clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
- if err != nil {
- continue
- }
- for _, cluster := range clusters.List {
- c := cluster
- clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
- if err != nil {
- continue
- }
- wg.Add(1)
- go func() {
- _, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id]
- if !ok {
- wg.Done()
- return
- }
- h := http.Request{}
- stat, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(h.Context())
- if err != nil {
- wg.Done()
- return
- }
- if stat == nil {
- wg.Done()
- return
- }
- clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
- if err != nil {
- wg.Done()
- return
- }
- var cardTotal int64
- var topsTotal float64
- for _, card := range stat.CardsAvail {
- cardTotal += int64(card.CardNum)
- topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
- }
-
- if (models.TClusterResource{} == *clusterResource) {
- err = svc.Scheduler.AiStorages.SaveClusterResources(adapter.Id, c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
- stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
- if err != nil {
- wg.Done()
- return
- }
- } else {
- if stat.CpuCoreTotal == 0 || stat.MemTotal == 0 || stat.DiskTotal == 0 {
- wg.Done()
- return
- }
- clusterResource.CardTotal = cardTotal
- clusterResource.CardTopsTotal = topsTotal
- clusterResource.CpuAvail = float64(stat.CpuCoreAvail)
- clusterResource.CpuTotal = float64(stat.CpuCoreTotal)
- clusterResource.MemAvail = stat.MemAvail
- clusterResource.MemTotal = stat.MemTotal
- clusterResource.DiskAvail = stat.DiskAvail
- clusterResource.DiskTotal = stat.DiskTotal
-
- err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
- if err != nil {
- wg.Done()
- return
- }
- }
- wg.Done()
- }()
- }
- }
- wg.Wait()
- }
|