You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aiCronTask.go 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. package cron
  2. import (
  3. "errors"
  4. "fmt"
  5. "github.com/zeromicro/go-zero/core/logx"
  6. "github.com/zeromicro/go-zero/zrpc"
  7. "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/config"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  16. "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
  17. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  18. "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
  19. "google.golang.org/grpc/codes"
  20. "google.golang.org/grpc/status"
  21. "net/http"
  22. "strconv"
  23. "sync"
  24. "time"
  25. )
// Cluster names recognized by this file. Each one is compared against a
// cluster's Name field to decide which platform client to construct.
const (
	// OCTOPUS selects the Octopus RPC client and storeLink.NewOctopusLink.
	OCTOPUS = "octopus"
	// MODELARTS selects the ModelArts RPC clients and storeLink.NewModelArtsLink.
	MODELARTS = "modelarts"
	// SHUGUANGAI selects the HPC-AC RPC client and storeLink.NewShuguangAi.
	SHUGUANGAI = "shuguangAi"
)
  31. func GetTaskList(svc *svc.ServiceContext) ([]*types.TaskModel, error) {
  32. limit := 10
  33. offset := 0
  34. var list []*types.TaskModel
  35. db := svc.DbEngin.Model(&types.TaskModel{}).Table("task")
  36. db = db.Where("deleted_at is null")
  37. //count total
  38. var total int64
  39. err := db.Count(&total).Error
  40. db.Limit(limit).Offset(offset)
  41. if err != nil {
  42. return nil, err
  43. }
  44. err = db.Order("created_time desc").Find(&list).Error
  45. if err != nil {
  46. return nil, err
  47. }
  48. return list, nil
  49. }
  50. func UpdateAiTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
  51. list := make([]*types.TaskModel, len(tasklist))
  52. copy(list, tasklist)
  53. for i := len(list) - 1; i >= 0; i-- {
  54. if list[i].AdapterTypeDict != 1 || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed {
  55. list = append(list[:i], list[i+1:]...)
  56. }
  57. }
  58. if len(list) == 0 {
  59. return
  60. }
  61. task := list[0]
  62. for i := range list {
  63. earliest, _ := time.Parse(constants.Layout, task.UpdatedTime)
  64. latest, _ := time.Parse(constants.Layout, list[i].UpdatedTime)
  65. if latest.Before(earliest) {
  66. task = list[i]
  67. }
  68. }
  69. var aiTaskList []*models.TaskAi
  70. tx := svc.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTaskList)
  71. if tx.Error != nil {
  72. logx.Errorf(tx.Error.Error())
  73. return
  74. }
  75. if len(aiTaskList) == 0 {
  76. return
  77. }
  78. var wg sync.WaitGroup
  79. for _, aitask := range aiTaskList {
  80. t := aitask
  81. if t.Status == constants.Completed || t.Status == constants.Failed {
  82. continue
  83. }
  84. wg.Add(1)
  85. go func() {
  86. h := http.Request{}
  87. trainingTask, err := svc.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(h.Context(), t.JobId)
  88. if err != nil {
  89. if status.Code(err) == codes.DeadlineExceeded {
  90. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  91. logx.Errorf(errors.New(msg).Error())
  92. wg.Done()
  93. return
  94. }
  95. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  96. logx.Errorf(errors.New(msg).Error())
  97. wg.Done()
  98. return
  99. }
  100. if trainingTask == nil {
  101. wg.Done()
  102. return
  103. }
  104. switch trainingTask.Status {
  105. case constants.Running:
  106. if t.Status != trainingTask.Status {
  107. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "running", "任务运行中")
  108. t.Status = trainingTask.Status
  109. }
  110. case constants.Failed:
  111. if t.Status != trainingTask.Status {
  112. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "failed", "任务失败")
  113. t.Status = trainingTask.Status
  114. }
  115. case constants.Completed:
  116. if t.Status != trainingTask.Status {
  117. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "completed", "任务完成")
  118. t.Status = trainingTask.Status
  119. }
  120. default:
  121. if t.Status != trainingTask.Status {
  122. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "pending", "任务pending")
  123. t.Status = trainingTask.Status
  124. }
  125. }
  126. t.StartTime = trainingTask.Start
  127. t.EndTime = trainingTask.End
  128. err = svc.Scheduler.AiStorages.UpdateAiTask(t)
  129. if err != nil {
  130. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  131. logx.Errorf(errors.New(msg).Error())
  132. wg.Done()
  133. return
  134. }
  135. wg.Done()
  136. }()
  137. }
  138. wg.Wait()
  139. }
  140. func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
  141. list := make([]*types.TaskModel, len(tasklist))
  142. copy(list, tasklist)
  143. for i := len(list) - 1; i >= 0; i-- {
  144. if list[i].AdapterTypeDict != 1 || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed {
  145. list = append(list[:i], list[i+1:]...)
  146. }
  147. }
  148. if len(list) == 0 {
  149. return
  150. }
  151. task := list[0]
  152. for i := range list {
  153. earliest, _ := time.Parse(time.RFC3339, task.UpdatedTime)
  154. latest, _ := time.Parse(time.RFC3339, list[i].UpdatedTime)
  155. if latest.Before(earliest) {
  156. task = list[i]
  157. }
  158. }
  159. var aiTask []*models.TaskAi
  160. tx := svc.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTask)
  161. if tx.Error != nil {
  162. logx.Errorf(tx.Error.Error())
  163. return
  164. }
  165. if len(aiTask) == 0 {
  166. tx = svc.DbEngin.Model(task).Table("task").Where("deleted_at is null").Updates(task)
  167. if tx.Error != nil {
  168. logx.Errorf(tx.Error.Error())
  169. return
  170. }
  171. return
  172. }
  173. if len(aiTask) == 1 {
  174. if aiTask[0].Status == constants.Completed {
  175. task.Status = constants.Succeeded
  176. } else {
  177. task.Status = aiTask[0].Status
  178. }
  179. task.StartTime = aiTask[0].StartTime
  180. task.EndTime = aiTask[0].EndTime
  181. task.UpdatedTime = time.Now().Format(constants.Layout)
  182. tx = svc.DbEngin.Model(task).Table("task").Where("deleted_at is null").Updates(task)
  183. if tx.Error != nil {
  184. logx.Errorf(tx.Error.Error())
  185. return
  186. }
  187. return
  188. }
  189. for i := len(aiTask) - 1; i >= 0; i-- {
  190. if aiTask[i].StartTime == "" {
  191. task.Status = aiTask[i].Status
  192. aiTask = append(aiTask[:i], aiTask[i+1:]...)
  193. }
  194. }
  195. if len(aiTask) == 0 {
  196. task.UpdatedTime = time.Now().Format(constants.Layout)
  197. tx = svc.DbEngin.Table("task").Model(task).Updates(task)
  198. if tx.Error != nil {
  199. logx.Errorf(tx.Error.Error())
  200. return
  201. }
  202. return
  203. }
  204. start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local)
  205. end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local)
  206. var status string
  207. var count int
  208. for _, a := range aiTask {
  209. s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local)
  210. e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local)
  211. if s.Before(start) {
  212. start = s
  213. }
  214. if e.After(end) {
  215. end = e
  216. }
  217. if a.Status == constants.Failed {
  218. status = a.Status
  219. break
  220. }
  221. if a.Status == constants.Pending {
  222. status = a.Status
  223. continue
  224. }
  225. if a.Status == constants.Running {
  226. status = a.Status
  227. continue
  228. }
  229. if a.Status == constants.Completed {
  230. count++
  231. continue
  232. }
  233. }
  234. if count == len(aiTask) {
  235. status = constants.Succeeded
  236. }
  237. if status != "" {
  238. task.Status = status
  239. task.StartTime = start.Format(constants.Layout)
  240. task.EndTime = end.Format(constants.Layout)
  241. }
  242. task.UpdatedTime = time.Now().Format(constants.Layout)
  243. tx = svc.DbEngin.Table("task").Model(task).Updates(task)
  244. if tx.Error != nil {
  245. logx.Errorf(tx.Error.Error())
  246. return
  247. }
  248. }
  249. func UpdateAiAdapterMaps(svc *svc.ServiceContext) {
  250. var aiType = "1"
  251. adapterIds, err := svc.Scheduler.AiStorages.GetAdapterIdsByType(aiType)
  252. if err != nil {
  253. msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
  254. logx.Errorf(errors.New(msg).Error())
  255. return
  256. }
  257. if len(adapterIds) == 0 {
  258. return
  259. }
  260. for _, id := range adapterIds {
  261. clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(id)
  262. if err != nil {
  263. msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
  264. logx.Errorf(errors.New(msg).Error())
  265. return
  266. }
  267. if len(clusters.List) == 0 {
  268. continue
  269. }
  270. if isAdapterExist(svc, id, len(clusters.List)) {
  271. continue
  272. } else {
  273. if isAdapterEmpty(svc, id) {
  274. exeClusterMap, colClusterMap := InitAiClusterMap(&svc.Config, clusters.List)
  275. svc.Scheduler.AiService.AiExecutorAdapterMap[id] = exeClusterMap
  276. svc.Scheduler.AiService.AiCollectorAdapterMap[id] = colClusterMap
  277. } else {
  278. UpdateClusterMaps(svc, id, clusters.List)
  279. }
  280. }
  281. }
  282. }
  283. func UpdateClusterMaps(svc *svc.ServiceContext, adapterId string, clusters []types.ClusterInfo) {
  284. for _, c := range clusters {
  285. _, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id]
  286. _, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[adapterId][c.Id]
  287. if !ok && !ok2 {
  288. switch c.Name {
  289. case OCTOPUS:
  290. id, _ := strconv.ParseInt(c.Id, 10, 64)
  291. octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(svc.Config.OctopusRpcConf))
  292. octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
  293. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
  294. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
  295. case MODELARTS:
  296. id, _ := strconv.ParseInt(c.Id, 10, 64)
  297. modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(svc.Config.ModelArtsRpcConf))
  298. modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(svc.Config.ModelArtsImgRpcConf))
  299. modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
  300. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
  301. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
  302. case SHUGUANGAI:
  303. id, _ := strconv.ParseInt(c.Id, 10, 64)
  304. aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(svc.Config.ACRpcConf))
  305. sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
  306. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
  307. svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
  308. }
  309. } else {
  310. continue
  311. }
  312. }
  313. }
  314. func isAdapterExist(svc *svc.ServiceContext, id string, clusterNum int) bool {
  315. emap, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
  316. cmap, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
  317. if ok && ok2 {
  318. if len(emap) == clusterNum && len(cmap) == clusterNum {
  319. return true
  320. }
  321. }
  322. return false
  323. }
  324. func isAdapterEmpty(svc *svc.ServiceContext, id string) bool {
  325. _, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
  326. _, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
  327. if !ok && !ok2 {
  328. return true
  329. }
  330. return false
  331. }
  332. func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) {
  333. executorMap := make(map[string]executor.AiExecutor)
  334. collectorMap := make(map[string]collector.AiCollector)
  335. for _, c := range clusters {
  336. switch c.Name {
  337. case OCTOPUS:
  338. id, _ := strconv.ParseInt(c.Id, 10, 64)
  339. octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
  340. octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
  341. collectorMap[c.Id] = octopus
  342. executorMap[c.Id] = octopus
  343. case MODELARTS:
  344. id, _ := strconv.ParseInt(c.Id, 10, 64)
  345. modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
  346. modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
  347. modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
  348. collectorMap[c.Id] = modelarts
  349. executorMap[c.Id] = modelarts
  350. case SHUGUANGAI:
  351. id, _ := strconv.ParseInt(c.Id, 10, 64)
  352. aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
  353. sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
  354. collectorMap[c.Id] = sgai
  355. executorMap[c.Id] = sgai
  356. }
  357. }
  358. return executorMap, collectorMap
  359. }
  360. func UpdateClusterResource(svc *svc.ServiceContext) {
  361. list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1")
  362. if err != nil {
  363. return
  364. }
  365. var wg sync.WaitGroup
  366. for _, adapter := range list {
  367. clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
  368. if err != nil {
  369. continue
  370. }
  371. for _, cluster := range clusters.List {
  372. c := cluster
  373. clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
  374. if err != nil {
  375. continue
  376. }
  377. wg.Add(1)
  378. go func() {
  379. _, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id]
  380. if !ok {
  381. wg.Done()
  382. return
  383. }
  384. h := http.Request{}
  385. stat, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(h.Context())
  386. if err != nil {
  387. wg.Done()
  388. return
  389. }
  390. if stat == nil {
  391. wg.Done()
  392. return
  393. }
  394. clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
  395. if err != nil {
  396. wg.Done()
  397. return
  398. }
  399. var cardTotal int64
  400. var topsTotal float64
  401. for _, card := range stat.CardsAvail {
  402. cardTotal += int64(card.CardNum)
  403. topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
  404. }
  405. if (models.TClusterResource{} == *clusterResource) {
  406. err = svc.Scheduler.AiStorages.SaveClusterResources(adapter.Id, c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
  407. stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
  408. if err != nil {
  409. wg.Done()
  410. return
  411. }
  412. } else {
  413. if stat.CpuCoreTotal == 0 || stat.MemTotal == 0 || stat.DiskTotal == 0 {
  414. wg.Done()
  415. return
  416. }
  417. clusterResource.CardTotal = cardTotal
  418. clusterResource.CardTopsTotal = topsTotal
  419. clusterResource.CpuAvail = float64(stat.CpuCoreAvail)
  420. clusterResource.CpuTotal = float64(stat.CpuCoreTotal)
  421. clusterResource.MemAvail = stat.MemAvail
  422. clusterResource.MemTotal = stat.MemTotal
  423. clusterResource.DiskAvail = stat.DiskAvail
  424. clusterResource.DiskTotal = stat.DiskTotal
  425. err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
  426. if err != nil {
  427. wg.Done()
  428. return
  429. }
  430. }
  431. wg.Done()
  432. }()
  433. }
  434. }
  435. wg.Wait()
  436. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.