You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

aiService.go 6.5 kB

4 months ago
4 months ago
4 months ago
11 months ago
11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. package service
  2. import (
  3. "github.com/zeromicro/go-zero/zrpc"
  4. "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
  5. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task/tasksync"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink/octopusHttp"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  15. "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
  16. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  17. "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
  18. "strconv"
  19. "sync"
  20. "time"
  21. )
  // Platform identifiers used to choose which client implementation to build
  // for a cluster. InitAiClusterMap compares them with ClusterInfo.Driver;
  // UpdateClusterMaps compares them with ClusterInfo.Name.
  22. const (
  // OCTOPUS selects the Octopus client.
  23. OCTOPUS = "octopus"
  // MODELARTS selects the ModelArts client.
  24. MODELARTS = "modelarts"
  // SHUGUANGAI selects the ShuguangAi client.
  25. SHUGUANGAI = "shuguangAi"
  // OPENI selects the OpenI client (only handled by InitAiClusterMap).
  26. OPENI = "openI"
  27. )
  // AiService holds the per-adapter, per-cluster client registries together
  // with the storage handle, configuration and task-sync helpers that
  // NewAiService wires up.
  28. type AiService struct {
  // AiExecutorAdapterMap maps adapter id -> cluster id -> executor client.
  29. AiExecutorAdapterMap map[string]map[string]executor.AiExecutor
  // AiCollectorAdapterMap maps adapter id -> cluster id -> collector client.
  30. AiCollectorAdapterMap map[string]map[string]collector.AiCollector
  // InferenceAdapterMap maps adapter id -> cluster id -> inference client.
  31. InferenceAdapterMap map[string]map[string]inference.ICluster
  // Storage is used to look up adapters and clusters and to check task names.
  32. Storage *database.AiStorage
  // LocalCache is the cache map handed to NewAiService by the caller; it is
  // stored as-is and not read anywhere in this file section.
  33. LocalCache map[string]interface{}
  // Conf is the coordinator configuration passed to NewAiService.
  34. Conf *config.Config
  // TaskSyncLock is not used in this file section.
  // NOTE(review): presumably serialises task synchronisation - confirm at call sites.
  35. TaskSyncLock sync.Mutex
  // St is built by tasksync.NewTrainTask from the collector registry.
  36. St *tasksync.SyncTrain
  // Si is built by tasksync.NewInferTask from the inference registry.
  37. Si *tasksync.SyncInfer
  38. }
  39. func NewAiService(conf *config.Config, storages *database.AiStorage, localCache map[string]interface{}) (*AiService, error) {
  40. var aiType = "1"
  41. adapterIds, err := storages.GetAdapterIdsByType(aiType)
  42. if err != nil {
  43. return nil, err
  44. }
  45. aiService := &AiService{
  46. AiExecutorAdapterMap: make(map[string]map[string]executor.AiExecutor),
  47. AiCollectorAdapterMap: make(map[string]map[string]collector.AiCollector),
  48. InferenceAdapterMap: make(map[string]map[string]inference.ICluster),
  49. Storage: storages,
  50. LocalCache: localCache,
  51. Conf: conf,
  52. }
  53. for _, id := range adapterIds {
  54. clusters, err := storages.GetClustersByAdapterId(id)
  55. if err != nil {
  56. return nil, err
  57. }
  58. if len(clusters.List) == 0 {
  59. continue
  60. }
  61. exeClusterMap, colClusterMap, inferMap := InitAiClusterMap(conf, clusters.List)
  62. aiService.AiExecutorAdapterMap[id] = exeClusterMap
  63. aiService.AiCollectorAdapterMap[id] = colClusterMap
  64. aiService.InferenceAdapterMap[id] = inferMap
  65. }
  66. st := tasksync.NewTrainTask(storages, aiService.AiCollectorAdapterMap, conf)
  67. si := tasksync.NewInferTask(storages, aiService.InferenceAdapterMap, conf)
  68. aiService.St = st
  69. aiService.Si = si
  70. return aiService, nil
  71. }
  72. func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector, map[string]inference.ICluster) {
  73. executorMap := make(map[string]executor.AiExecutor)
  74. collectorMap := make(map[string]collector.AiCollector)
  75. inferenceMap := make(map[string]inference.ICluster)
  76. for _, c := range clusters {
  77. switch c.Driver {
  78. case OCTOPUS:
  79. id, _ := strconv.ParseInt(c.Id, 10, 64)
  80. octopus := octopusHttp.NewOctopusHttp(id, c.Nickname, c.Server, c.Address, c.Username, c.Password)
  81. collectorMap[c.Id] = octopus
  82. executorMap[c.Id] = octopus
  83. inferenceMap[c.Id] = octopus
  84. case MODELARTS:
  85. id, _ := strconv.ParseInt(c.Id, 10, 64)
  86. modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
  87. modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
  88. modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
  89. collectorMap[c.Id] = modelarts
  90. executorMap[c.Id] = modelarts
  91. inferenceMap[c.Id] = modelarts
  92. case SHUGUANGAI:
  93. id, _ := strconv.ParseInt(c.Id, 10, 64)
  94. aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
  95. sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
  96. collectorMap[c.Id] = sgai
  97. executorMap[c.Id] = sgai
  98. inferenceMap[c.Id] = sgai
  99. case OPENI:
  100. id, _ := strconv.ParseInt(c.Id, 10, 64)
  101. openi := storeLink.NewOpenI(c.Server, id, c.Username, c.Token, c.Nickname)
  102. collectorMap[c.Id] = openi
  103. executorMap[c.Id] = openi
  104. inferenceMap[c.Id] = openi
  105. }
  106. }
  107. return executorMap, collectorMap, inferenceMap
  108. }
  109. func (as *AiService) UpdateClusterMaps(conf *config.Config, adapterId string, clusters []types.ClusterInfo) {
  110. for _, c := range clusters {
  111. _, ok := as.AiExecutorAdapterMap[adapterId][c.Id]
  112. _, ok2 := as.AiCollectorAdapterMap[adapterId][c.Id]
  113. _, ok3 := as.InferenceAdapterMap[adapterId][c.Id]
  114. if !ok && !ok2 && !ok3 {
  115. switch c.Name {
  116. case OCTOPUS:
  117. id, _ := strconv.ParseInt(c.Id, 10, 64)
  118. octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
  119. octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
  120. as.AiExecutorAdapterMap[adapterId][c.Id] = octopus
  121. as.AiCollectorAdapterMap[adapterId][c.Id] = octopus
  122. as.InferenceAdapterMap[adapterId][c.Id] = octopus
  123. case MODELARTS:
  124. id, _ := strconv.ParseInt(c.Id, 10, 64)
  125. modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
  126. modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
  127. modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
  128. as.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
  129. as.AiCollectorAdapterMap[adapterId][c.Id] = modelarts
  130. as.InferenceAdapterMap[adapterId][c.Id] = modelarts
  131. case SHUGUANGAI:
  132. id, _ := strconv.ParseInt(c.Id, 10, 64)
  133. aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
  134. sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
  135. as.AiExecutorAdapterMap[adapterId][c.Id] = sgai
  136. as.AiCollectorAdapterMap[adapterId][c.Id] = sgai
  137. as.InferenceAdapterMap[adapterId][c.Id] = sgai
  138. }
  139. } else {
  140. continue
  141. }
  142. }
  143. }
  144. func (as *AiService) HandleDuplicateTaskName(name string, taskType string) (string, error) {
  145. exist, err := as.Storage.DoesTaskNameExist(name, taskType)
  146. if err != nil {
  147. return "", err
  148. }
  149. if exist {
  150. return name + "_" + time.Now().Format(constants.Layout_Time_Suffix), nil
  151. }
  152. return name, nil
  153. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.