You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

taskStatusSync.go 8.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. package status
  2. import (
  3. "errors"
  4. "fmt"
  5. "github.com/zeromicro/go-zero/core/logx"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  9. "google.golang.org/grpc/codes"
  10. "google.golang.org/grpc/status"
  11. "net/http"
  12. "strconv"
  13. "sync"
  14. "time"
  15. )
  16. func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
  17. list := make([]*types.TaskModel, len(tasklist))
  18. copy(list, tasklist)
  19. for i := len(list) - 1; i >= 0; i-- {
  20. if list[i].AdapterTypeDict != "1" || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed {
  21. list = append(list[:i], list[i+1:]...)
  22. }
  23. }
  24. if len(list) == 0 {
  25. return
  26. }
  27. task := list[0]
  28. for i := range list {
  29. earliest, _ := time.Parse(time.RFC3339, task.UpdatedTime)
  30. latest, _ := time.Parse(time.RFC3339, list[i].UpdatedTime)
  31. if latest.Before(earliest) {
  32. task = list[i]
  33. }
  34. }
  35. // Update Infer Task Status
  36. if task.TaskTypeDict == "11" || task.TaskTypeDict == "12" {
  37. updateInferTaskStatus(svc, *task)
  38. return
  39. }
  40. aiTask, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
  41. if err != nil {
  42. logx.Errorf(err.Error())
  43. return
  44. }
  45. if len(aiTask) == 0 {
  46. err := svc.Scheduler.AiStorages.UpdateTask(task)
  47. if err != nil {
  48. return
  49. }
  50. return
  51. }
  52. if len(aiTask) == 1 {
  53. if aiTask[0].Status == constants.Completed {
  54. task.Status = constants.Succeeded
  55. } else {
  56. task.Status = aiTask[0].Status
  57. }
  58. task.StartTime = aiTask[0].StartTime
  59. task.EndTime = aiTask[0].EndTime
  60. err := svc.Scheduler.AiStorages.UpdateTask(task)
  61. if err != nil {
  62. return
  63. }
  64. return
  65. }
  66. for i := len(aiTask) - 1; i >= 0; i-- {
  67. if aiTask[i].StartTime == "" {
  68. task.Status = aiTask[i].Status
  69. aiTask = append(aiTask[:i], aiTask[i+1:]...)
  70. }
  71. }
  72. if len(aiTask) == 0 {
  73. err := svc.Scheduler.AiStorages.UpdateTask(task)
  74. if err != nil {
  75. return
  76. }
  77. return
  78. }
  79. start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local)
  80. end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local)
  81. var status string
  82. var count int
  83. for _, a := range aiTask {
  84. s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local)
  85. e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local)
  86. if s.Before(start) {
  87. start = s
  88. }
  89. if e.After(end) {
  90. end = e
  91. }
  92. if a.Status == constants.Failed {
  93. status = a.Status
  94. break
  95. }
  96. if a.Status == constants.Pending {
  97. status = a.Status
  98. continue
  99. }
  100. if a.Status == constants.Running {
  101. status = a.Status
  102. continue
  103. }
  104. if a.Status == constants.Completed {
  105. count++
  106. continue
  107. }
  108. }
  109. if count == len(aiTask) {
  110. status = constants.Succeeded
  111. }
  112. if status != "" {
  113. task.Status = status
  114. task.StartTime = start.Format(constants.Layout)
  115. task.EndTime = end.Format(constants.Layout)
  116. }
  117. err = svc.Scheduler.AiStorages.UpdateTask(task)
  118. if err != nil {
  119. return
  120. }
  121. }
  122. func updateInferTaskStatus(svc *svc.ServiceContext, task types.TaskModel) {
  123. aiTask, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
  124. if err != nil {
  125. logx.Errorf(err.Error())
  126. return
  127. }
  128. if len(aiTask) == 0 {
  129. task.Status = constants.Failed
  130. err = svc.Scheduler.AiStorages.UpdateTask(&task)
  131. if err != nil {
  132. return
  133. }
  134. return
  135. }
  136. if len(aiTask) == 1 {
  137. if aiTask[0].Status == constants.Completed {
  138. task.StartTime = aiTask[0].StartTime
  139. task.EndTime = aiTask[0].EndTime
  140. task.Status = constants.Succeeded
  141. } else {
  142. task.StartTime = aiTask[0].StartTime
  143. task.Status = aiTask[0].Status
  144. }
  145. err = svc.Scheduler.AiStorages.UpdateTask(&task)
  146. if err != nil {
  147. return
  148. }
  149. return
  150. }
  151. //for i := len(aiTask) - 1; i >= 0; i-- {
  152. // if aiTask[i].StartTime == "" {
  153. // task.Status = aiTask[i].Status
  154. // aiTask = append(aiTask[:i], aiTask[i+1:]...)
  155. // }
  156. //}
  157. //
  158. //if len(aiTask) == 0 {
  159. // task.UpdatedTime = time.Now().Format(constants.Layout)
  160. // tx = svc.DbEngin.Table("task").Model(task).Updates(task)
  161. // if tx.Error != nil {
  162. // logx.Errorf(tx.Error.Error())
  163. // return
  164. // }
  165. // return
  166. //}
  167. if aiTask[0].StartTime == "" {
  168. return
  169. }
  170. start, _ := time.ParseInLocation(time.RFC3339, aiTask[0].StartTime, time.Local)
  171. end, _ := time.ParseInLocation(time.RFC3339, aiTask[0].EndTime, time.Local)
  172. var status string
  173. var count int
  174. for _, a := range aiTask {
  175. if a.Status == constants.Failed {
  176. status = a.Status
  177. break
  178. }
  179. if a.Status == constants.Pending {
  180. status = a.Status
  181. continue
  182. }
  183. if a.Status == constants.Running {
  184. status = a.Status
  185. continue
  186. }
  187. if a.Status == constants.Completed {
  188. count++
  189. continue
  190. }
  191. }
  192. if count == len(aiTask) {
  193. status = constants.Succeeded
  194. }
  195. if status == constants.Succeeded {
  196. task.Status = status
  197. task.StartTime = start.Format(time.RFC3339)
  198. task.EndTime = end.Format(time.RFC3339)
  199. } else {
  200. task.Status = status
  201. task.StartTime = start.Format(time.RFC3339)
  202. }
  203. err = svc.Scheduler.AiStorages.UpdateTask(&task)
  204. if err != nil {
  205. return
  206. }
  207. }
  208. func UpdateAiTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
  209. list := make([]*types.TaskModel, len(tasklist))
  210. copy(list, tasklist)
  211. for i := len(list) - 1; i >= 0; i-- {
  212. if list[i].AdapterTypeDict != "1" || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed {
  213. list = append(list[:i], list[i+1:]...)
  214. }
  215. }
  216. if len(list) == 0 {
  217. return
  218. }
  219. task := list[0]
  220. for i := range list {
  221. earliest, _ := time.Parse(constants.Layout, task.UpdatedTime)
  222. latest, _ := time.Parse(constants.Layout, list[i].UpdatedTime)
  223. if latest.Before(earliest) {
  224. task = list[i]
  225. }
  226. }
  227. aiTaskList, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
  228. if err != nil {
  229. logx.Errorf(err.Error())
  230. return
  231. }
  232. if len(aiTaskList) == 0 {
  233. return
  234. }
  235. var wg sync.WaitGroup
  236. for _, aitask := range aiTaskList {
  237. t := aitask
  238. if t.Status == constants.Completed || t.Status == constants.Failed || t.JobId == "" {
  239. continue
  240. }
  241. wg.Add(1)
  242. go func() {
  243. h := http.Request{}
  244. trainingTask, err := svc.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(h.Context(), t.JobId)
  245. if err != nil {
  246. if status.Code(err) == codes.DeadlineExceeded {
  247. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  248. logx.Errorf(errors.New(msg).Error())
  249. wg.Done()
  250. return
  251. }
  252. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  253. logx.Errorf(errors.New(msg).Error())
  254. wg.Done()
  255. return
  256. }
  257. if trainingTask == nil {
  258. wg.Done()
  259. return
  260. }
  261. switch trainingTask.Status {
  262. case constants.Running:
  263. if t.Status != trainingTask.Status {
  264. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "running", "任务运行中")
  265. t.Status = trainingTask.Status
  266. }
  267. case constants.Failed:
  268. if t.Status != trainingTask.Status {
  269. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "failed", "任务失败")
  270. t.Status = trainingTask.Status
  271. }
  272. case constants.Completed:
  273. if t.Status != trainingTask.Status {
  274. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "completed", "任务完成")
  275. t.Status = trainingTask.Status
  276. }
  277. default:
  278. if t.Status != trainingTask.Status {
  279. svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "pending", "任务pending")
  280. t.Status = trainingTask.Status
  281. }
  282. }
  283. t.StartTime = trainingTask.Start
  284. t.EndTime = trainingTask.End
  285. err = svc.Scheduler.AiStorages.UpdateAiTask(t)
  286. if err != nil {
  287. msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
  288. logx.Errorf(errors.New(msg).Error())
  289. wg.Done()
  290. return
  291. }
  292. wg.Done()
  293. }()
  294. }
  295. wg.Wait()
  296. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.