You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

taskStatusSync.go 11 kB

10 months ago
10 months ago
10 months ago
10 months ago

  1. package status
  2. import (
  3. "errors"
  4. "fmt"
  5. "github.com/zeromicro/go-zero/core/logx"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  11. "google.golang.org/grpc/codes"
  12. "google.golang.org/grpc/status"
  13. "net/http"
  14. "strconv"
  15. "sync"
  16. "time"
  17. )
  18. func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
  19. list := make([]*types.TaskModel, len(tasklist))
  20. copy(list, tasklist)
  21. for i := len(list) - 1; i >= 0; i-- {
  22. if list[i].AdapterTypeDict != "1" || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed || list[i].Status == constants.Cancelled {
  23. list = append(list[:i], list[i+1:]...)
  24. }
  25. }
  26. if len(list) == 0 {
  27. return
  28. }
  29. task := list[0]
  30. for i := range list {
  31. earliest, _ := time.Parse(time.RFC3339, task.UpdatedTime)
  32. latest, _ := time.Parse(time.RFC3339, list[i].UpdatedTime)
  33. if latest.Before(earliest) {
  34. task = list[i]
  35. }
  36. }
  37. // Update Infer Task Status
  38. if task.TaskTypeDict == "11" || task.TaskTypeDict == "12" {
  39. updateInferTaskStatus(svc, *task)
  40. return
  41. }
  42. aiTask, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
  43. if err != nil {
  44. logx.Errorf(err.Error())
  45. return
  46. }
  47. if len(aiTask) == 0 {
  48. err := svc.Scheduler.AiStorages.UpdateTask(task)
  49. if err != nil {
  50. return
  51. }
  52. return
  53. }
  54. if len(aiTask) == 1 {
  55. switch aiTask[0].Status {
  56. case constants.Completed:
  57. task.Status = constants.Succeeded
  58. _ = reportStatusMessages(svc, task, aiTask[0])
  59. case constants.Failed:
  60. task.Status = constants.Failed
  61. _ = reportStatusMessages(svc, task, aiTask[0])
  62. default:
  63. task.Status = aiTask[0].Status
  64. }
  65. task.StartTime = aiTask[0].StartTime
  66. task.EndTime = aiTask[0].EndTime
  67. err := svc.Scheduler.AiStorages.UpdateTask(task)
  68. if err != nil {
  69. return
  70. }
  71. return
  72. }
  73. for i := len(aiTask) - 1; i >= 0; i-- {
  74. if aiTask[i].StartTime == "" {
  75. task.Status = aiTask[i].Status
  76. aiTask = append(aiTask[:i], aiTask[i+1:]...)
  77. }
  78. }
  79. if len(aiTask) == 0 {
  80. err := svc.Scheduler.AiStorages.UpdateTask(task)
  81. if err != nil {
  82. return
  83. }
  84. return
  85. }
  86. start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local)
  87. end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local)
  88. var status string
  89. var count int
  90. for _, a := range aiTask {
  91. s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local)
  92. e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local)
  93. if s.Before(start) {
  94. start = s
  95. }
  96. if e.After(end) {
  97. end = e
  98. }
  99. if a.Status == constants.Failed {
  100. status = a.Status
  101. break
  102. }
  103. if a.Status == constants.Pending {
  104. status = a.Status
  105. continue
  106. }
  107. if a.Status == constants.Running {
  108. status = a.Status
  109. continue
  110. }
  111. if a.Status == constants.Completed {
  112. count++
  113. continue
  114. }
  115. }
  116. if count == len(aiTask) {
  117. status = constants.Succeeded
  118. }
  119. if status != "" {
  120. task.Status = status
  121. task.StartTime = start.Format(constants.Layout)
  122. task.EndTime = end.Format(constants.Layout)
  123. }
  124. err = svc.Scheduler.AiStorages.UpdateTask(task)
  125. if err != nil {
  126. return
  127. }
  128. }
  129. func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error {
  130. report := &jcs.JobStatusReportReq{
  131. TaskName: task.Name,
  132. TaskID: strconv.FormatInt(task.Id, 10),
  133. Messages: make([]*jcs.ReportMessage, 0),
  134. }
  135. //add report msg
  136. jobMsg := &jcs.ReportMessage{
  137. Status: true,
  138. Message: "",
  139. ClusterID: strconv.FormatInt(aiTask.ClusterId, 10),
  140. Output: aiTask.JobId,
  141. }
  142. report.Messages = append(report.Messages, jobMsg)
  143. _ = jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
  144. return nil
  145. }
  146. func updateInferTaskStatus(svc *svc.ServiceContext, task types.TaskModel) {
  147. aiTask, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
  148. if err != nil {
  149. logx.Errorf(err.Error())
  150. return
  151. }
  152. if len(aiTask) == 0 {
  153. //task.Status = constants.Failed
  154. err = svc.Scheduler.AiStorages.UpdateTask(&task)
  155. if err != nil {
  156. return
  157. }
  158. return
  159. }
  160. if len(aiTask) == 1 {
  161. if aiTask[0].Status == constants.Completed {
  162. task.StartTime = aiTask[0].StartTime
  163. task.EndTime = aiTask[0].EndTime
  164. task.Status = constants.Succeeded
  165. } else {
  166. task.StartTime = aiTask[0].StartTime
  167. task.Status = aiTask[0].Status
  168. }
  169. err = svc.Scheduler.AiStorages.UpdateTask(&task)
  170. if err != nil {
  171. return
  172. }
  173. return
  174. }
  175. //for i := len(aiTask) - 1; i >= 0; i-- {
  176. // if aiTask[i].StartTime == "" {
  177. // task.Status = aiTask[i].Status
  178. // aiTask = append(aiTask[:i], aiTask[i+1:]...)
  179. // }
  180. //}
  181. //
  182. //if len(aiTask) == 0 {
  183. // task.UpdatedTime = time.Now().Format(constants.Layout)
  184. // tx = svc.DbEngin.Table("task").Model(task).Updates(task)
  185. // if tx.Error != nil {
  186. // logx.Errorf(tx.Error.Error())
  187. // return
  188. // }
  189. // return
  190. //}
  191. if aiTask[0].StartTime == "" {
  192. return
  193. }
  194. start, _ := time.ParseInLocation(time.RFC3339, aiTask[0].StartTime, time.Local)
  195. end, _ := time.ParseInLocation(time.RFC3339, aiTask[0].EndTime, time.Local)
  196. var status string
  197. var count int
  198. for _, a := range aiTask {
  199. if a.Status == constants.Failed {
  200. status = a.Status
  201. break
  202. }
  203. if a.Status == constants.Pending {
  204. status = a.Status
  205. continue
  206. }
  207. if a.Status == constants.Running {
  208. status = a.Status
  209. continue
  210. }
  211. if a.Status == constants.Completed {
  212. count++
  213. continue
  214. }
  215. }
  216. if count == len(aiTask) {
  217. status = constants.Succeeded
  218. }
  219. if status == constants.Succeeded {
  220. task.Status = status
  221. task.StartTime = start.Format(time.RFC3339)
  222. task.EndTime = end.Format(time.RFC3339)
  223. } else {
  224. task.Status = status
  225. task.StartTime = start.Format(time.RFC3339)
  226. }
  227. err = svc.Scheduler.AiStorages.UpdateTask(&task)
  228. if err != nil {
  229. return
  230. }
  231. }
  232. func UpdateAiTask(svc *svc.ServiceContext, aiTaskList ...*models.TaskAi) {
  233. var wg sync.WaitGroup
  234. for _, aitask := range aiTaskList {
  235. t := aitask
  236. if t.Status == constants.Completed || t.Status == constants.Failed || t.JobId == "" || t.Status == constants.Cancelled {
  237. continue
  238. }
  239. wg.Add(1)
  240. go func() {
  241. h := http.Request{}
  242. trainingTask, err := svc.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(h.Context(), t.JobId)
  243. if err != nil {
  244. if status.Code(err) == codes.DeadlineExceeded {
  245. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  246. logx.Errorf(errors.New(msg).Error())
  247. wg.Done()
  248. return
  249. }
  250. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  251. logx.Errorf(errors.New(msg).Error())
  252. wg.Done()
  253. return
  254. }
  255. if trainingTask == nil {
  256. wg.Done()
  257. return
  258. }
  259. switch trainingTask.Status {
  260. case constants.Running:
  261. if t.Status != trainingTask.Status {
  262. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "running", "任务运行中")
  263. t.Status = trainingTask.Status
  264. }
  265. case constants.Failed:
  266. if t.Status != trainingTask.Status {
  267. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "failed", "任务失败")
  268. t.Status = trainingTask.Status
  269. }
  270. case constants.Completed:
  271. if t.Status != trainingTask.Status {
  272. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "completed", "任务完成")
  273. t.Status = trainingTask.Status
  274. }
  275. default:
  276. if t.Status != trainingTask.Status {
  277. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "pending", "任务pending")
  278. t.Status = trainingTask.Status
  279. }
  280. }
  281. t.StartTime = trainingTask.Start
  282. t.EndTime = trainingTask.End
  283. err = svc.Scheduler.AiStorages.UpdateAiTask(t)
  284. if err != nil {
  285. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  286. logx.Errorf(errors.New(msg).Error())
  287. wg.Done()
  288. return
  289. }
  290. wg.Done()
  291. }()
  292. }
  293. wg.Wait()
  294. }
  295. func UpdateAiTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
  296. list := make([]*types.TaskModel, len(tasklist))
  297. copy(list, tasklist)
  298. for i := len(list) - 1; i >= 0; i-- {
  299. if list[i].AdapterTypeDict != "1" || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed {
  300. list = append(list[:i], list[i+1:]...)
  301. }
  302. }
  303. if len(list) == 0 {
  304. return
  305. }
  306. task := list[0]
  307. for i := range list {
  308. earliest, _ := time.Parse(constants.Layout, task.UpdatedTime)
  309. latest, _ := time.Parse(constants.Layout, list[i].UpdatedTime)
  310. if latest.Before(earliest) {
  311. task = list[i]
  312. }
  313. }
  314. aiTaskList, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
  315. if err != nil {
  316. logx.Errorf(err.Error())
  317. return
  318. }
  319. if len(aiTaskList) == 0 {
  320. return
  321. }
  322. UpdateAiTask(svc, aiTaskList...)
  323. }
  324. func UpdateTrainingTaskStatus(svc *svc.ServiceContext, list []*types.AdapterInfo) {
  325. var wg sync.WaitGroup
  326. for _, adapter := range list {
  327. taskList, err := svc.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id)
  328. if err != nil {
  329. continue
  330. }
  331. if len(taskList) == 0 {
  332. continue
  333. }
  334. for _, task := range taskList {
  335. t := task
  336. if t.Status == constants.Completed || task.Status == constants.Failed || task.Status == constants.Stopped || task.TaskType != "pytorch" {
  337. continue
  338. }
  339. wg.Add(1)
  340. go func() {
  341. h := http.Request{}
  342. trainingTask, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(h.Context(), t.JobId)
  343. if err != nil {
  344. msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  345. logx.Errorf(errors.New(msg).Error())
  346. wg.Done()
  347. return
  348. }
  349. if trainingTask == nil {
  350. wg.Done()
  351. return
  352. }
  353. t.Status = trainingTask.Status
  354. t.StartTime = trainingTask.Start
  355. t.EndTime = trainingTask.End
  356. err = svc.Scheduler.AiStorages.UpdateAiTask(t)
  357. if err != nil {
  358. wg.Done()
  359. return
  360. }
  361. wg.Done()
  362. }()
  363. }
  364. }
  365. wg.Wait()
  366. return
  367. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecology of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.