You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

aiCronTask.go 8.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. package cron
  2. import (
  3. "errors"
  4. "fmt"
  5. "github.com/zeromicro/go-zero/core/logx"
  6. "github.com/zeromicro/go-zero/zrpc"
  7. hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/config"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  15. "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
  16. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  17. "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
  18. "net/http"
  19. "strconv"
  20. "sync"
  21. )
  // Cluster platform names. In this file they are compared against
  // ClusterInfo.Name to choose which storeLink client is built for a cluster.
  22. const (
  // OCTOPUS selects the Octopus link (storeLink.NewOctopusLink).
  23. OCTOPUS = "octopus"
  // MODELARTS selects the ModelArts link (storeLink.NewModelArtsLink).
  24. MODELARTS = "modelarts"
  // SHUGUANGAI selects the Shuguang AI link (storeLink.NewShuguangAi).
  25. SHUGUANGAI = "shuguangAi"
  26. )
  27. func GetTaskList(svc *svc.ServiceContext) ([]*types.TaskModel, error) {
  28. limit := 10
  29. offset := 0
  30. var list []*types.TaskModel
  31. db := svc.DbEngin.Model(&types.TaskModel{}).Table("task")
  32. db = db.Where("deleted_at is null")
  33. //count total
  34. var total int64
  35. err := db.Count(&total).Error
  36. db.Limit(limit).Offset(offset)
  37. if err != nil {
  38. return nil, err
  39. }
  40. err = db.Order("created_time desc").Find(&list).Error
  41. if err != nil {
  42. return nil, err
  43. }
  44. return list, nil
  45. }
  46. func UpdateAiAdapterMaps(svc *svc.ServiceContext) {
  47. var aiType = "1"
  48. adapterIds, err := svc.Scheduler.AiStorages.GetAdapterIdsByType(aiType)
  49. if err != nil {
  50. msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
  51. logx.Errorf(errors.New(msg).Error())
  52. return
  53. }
  54. if len(adapterIds) == 0 {
  55. return
  56. }
  57. for _, id := range adapterIds {
  58. clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(id)
  59. if err != nil {
  60. msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
  61. logx.Errorf(errors.New(msg).Error())
  62. return
  63. }
  64. if len(clusters.List) == 0 {
  65. continue
  66. }
  67. if isAdapterExist(svc, id, len(clusters.List)) {
  68. continue
  69. } else {
  70. if isAdapterEmpty(svc, id) {
  71. exeClusterMap, colClusterMap := InitAiClusterMap(&svc.Config, clusters.List)
  72. svc.Scheduler.AiService.AiExecutorAdapterMap[id] = exeClusterMap
  73. svc.Scheduler.AiService.AiCollectorAdapterMap[id] = colClusterMap
  74. } else {
  75. UpdateClusterMaps(svc, id, clusters.List)
  76. }
  77. }
  78. }
  79. }
  80. func UpdateClusterMaps(svc *svc.ServiceContext, adapterId string, clusters []types.ClusterInfo) {
  81. for _, c := range clusters {
  82. _, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id]
  83. _, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[adapterId][c.Id]
  84. if !ok && !ok2 {
  85. switch c.Name {
  86. case OCTOPUS:
  87. id, _ := strconv.ParseInt(c.Id, 10, 64)
  88. octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(svc.Config.OctopusRpcConf))
  89. octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
  90. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
  91. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
  92. case MODELARTS:
  93. id, _ := strconv.ParseInt(c.Id, 10, 64)
  94. modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(svc.Config.ModelArtsRpcConf))
  95. modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(svc.Config.ModelArtsImgRpcConf))
  96. modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
  97. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
  98. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
  99. case SHUGUANGAI:
  100. id, _ := strconv.ParseInt(c.Id, 10, 64)
  101. aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(svc.Config.ACRpcConf))
  102. sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
  103. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
  104. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
  105. }
  106. } else {
  107. continue
  108. }
  109. }
  110. }
  111. func isAdapterExist(svc *svc.ServiceContext, id string, clusterNum int) bool {
  112. emap, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
  113. cmap, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
  114. if ok && ok2 {
  115. if len(emap) == clusterNum && len(cmap) == clusterNum {
  116. return true
  117. }
  118. }
  119. return false
  120. }
  121. func isAdapterEmpty(svc *svc.ServiceContext, id string) bool {
  122. _, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
  123. _, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
  124. if !ok && !ok2 {
  125. return true
  126. }
  127. return false
  128. }
  129. func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) {
  130. executorMap := make(map[string]executor.AiExecutor)
  131. collectorMap := make(map[string]collector.AiCollector)
  132. for _, c := range clusters {
  133. switch c.Name {
  134. case OCTOPUS:
  135. id, _ := strconv.ParseInt(c.Id, 10, 64)
  136. octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
  137. octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
  138. collectorMap[c.Id] = octopus
  139. executorMap[c.Id] = octopus
  140. case MODELARTS:
  141. id, _ := strconv.ParseInt(c.Id, 10, 64)
  142. modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
  143. modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
  144. modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
  145. collectorMap[c.Id] = modelarts
  146. executorMap[c.Id] = modelarts
  147. case SHUGUANGAI:
  148. id, _ := strconv.ParseInt(c.Id, 10, 64)
  149. aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
  150. sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
  151. collectorMap[c.Id] = sgai
  152. executorMap[c.Id] = sgai
  153. }
  154. }
  155. return executorMap, collectorMap
  156. }
  157. func UpdateClusterResource(svc *svc.ServiceContext) {
  158. list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1")
  159. if err != nil {
  160. return
  161. }
  162. var wg sync.WaitGroup
  163. for _, adapter := range list {
  164. clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
  165. if err != nil {
  166. continue
  167. }
  168. for _, cluster := range clusters.List {
  169. c := cluster
  170. clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
  171. if err != nil {
  172. continue
  173. }
  174. wg.Add(1)
  175. go func() {
  176. _, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id]
  177. if !ok {
  178. wg.Done()
  179. return
  180. }
  181. h := http.Request{}
  182. stat, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(h.Context())
  183. if err != nil {
  184. wg.Done()
  185. return
  186. }
  187. if stat == nil {
  188. wg.Done()
  189. return
  190. }
  191. clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
  192. if err != nil {
  193. wg.Done()
  194. return
  195. }
  196. var cardTotal int64
  197. var topsTotal float64
  198. for _, card := range stat.CardsAvail {
  199. cardTotal += int64(card.CardNum)
  200. topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
  201. }
  202. if (models.TClusterResource{} == *clusterResource) {
  203. err = svc.Scheduler.AiStorages.SaveClusterResources(adapter.Id, c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
  204. stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
  205. if err != nil {
  206. wg.Done()
  207. return
  208. }
  209. } else {
  210. if stat.CpuCoreTotal == 0 || stat.MemTotal == 0 || stat.DiskTotal == 0 {
  211. wg.Done()
  212. return
  213. }
  214. clusterResource.CardTotal = cardTotal
  215. clusterResource.CardTopsTotal = topsTotal
  216. clusterResource.CpuAvail = float64(stat.CpuCoreAvail)
  217. clusterResource.CpuTotal = float64(stat.CpuCoreTotal)
  218. clusterResource.MemAvail = stat.MemAvail
  219. clusterResource.MemTotal = stat.MemTotal
  220. clusterResource.DiskAvail = stat.DiskAvail
  221. clusterResource.DiskTotal = stat.DiskTotal
  222. err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
  223. if err != nil {
  224. wg.Done()
  225. return
  226. }
  227. }
  228. wg.Done()
  229. }()
  230. }
  231. }
  232. wg.Wait()
  233. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.