You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudtasksync.go 7.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. package status
  2. import (
  3. "context"
  4. "fmt"
  5. jsoniter "github.com/json-iterator/go"
  6. "github.com/rs/zerolog/log"
  7. "github.com/zeromicro/go-zero/core/logx"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  14. "gorm.io/gorm"
  15. "strconv"
  16. "time"
  17. )
  18. func reportCloudStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, cloudTask *models.TaskCloud, status bool, message string) error {
  19. report := &jcs.JobStatusReportReq{}
  20. reportMsg := &jcs.TrainReportMessage{
  21. Type: "Train",
  22. TaskName: task.Name,
  23. TaskID: strconv.FormatInt(task.Id, 10),
  24. Status: status,
  25. Message: message,
  26. ClusterID: strconv.FormatInt(cloudTask.ClusterId, 10),
  27. }
  28. report.Report = reportMsg
  29. marshal, _ := jsoniter.MarshalToString(report)
  30. log.Debug().Msgf("通知中间件任务状态参数: [%v]", marshal)
  31. err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
  32. if err != nil {
  33. logx.Errorf("############ Report Status Message Error %s", err.Error())
  34. return err
  35. }
  36. return nil
  37. }
  38. // UpdateCloudTaskStatus CLOUD 任务状态同步函数
  39. func UpdateCloudTaskStatus(svc *svc.ServiceContext) {
  40. // 1. 查询需要同步的通算任务
  41. var cloudTaskList []*models.TaskCloud
  42. sqlStr := `SELECT * FROM task_cloud WHERE status NOT IN ('Failed', 'Completed', 'Cancelled') ORDER BY create_time DESC LIMIT 10`
  43. if err := svc.DbEngin.Raw(sqlStr).Scan(&cloudTaskList).Error; err != nil {
  44. logx.Errorf("Failed to query CLOUD tasks for sync: %v", err)
  45. return
  46. }
  47. if len(cloudTaskList) == 0 {
  48. return
  49. }
  50. // 2. 批量获取关联的 Task 模型
  51. taskIDs := make([]int64, len(cloudTaskList))
  52. for i, cloud := range cloudTaskList {
  53. taskIDs[i] = cloud.TaskId
  54. }
  55. taskMap := make(map[int64]*types.TaskModel)
  56. var tasks []*types.TaskModel
  57. if err := svc.DbEngin.Model(&models.Task{}).Where("id IN ?", taskIDs).Find(&tasks).Error; err != nil {
  58. logx.Errorf("Failed to batch query tasks: %v", err)
  59. return
  60. }
  61. for _, task := range tasks {
  62. taskMap[task.Id] = task
  63. }
  64. // 3. 遍历 CLOUD 任务并更新状态
  65. for _, cloud := range cloudTaskList {
  66. task, ok := taskMap[cloud.TaskId]
  67. if !ok {
  68. logx.Errorf("Task with ID %d not found for CLOUD task %d, skipping", cloud.TaskId, cloud.Id)
  69. continue
  70. }
  71. // 使用带超时的 Context,防止 API 调用阻塞
  72. ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
  73. defer cancel()
  74. adapterIDStr := strconv.FormatInt(cloud.AdapterId, 10)
  75. adapter, adapterExists := svc.Scheduler.CloudService.CloudExecutorAdapterMap[adapterIDStr]
  76. if !adapterExists {
  77. logx.Errorf("CLOUD adapter with ID %s not found, skipping task %s", adapterIDStr, cloud.Name)
  78. continue
  79. }
  80. // 4. 从 CLOUD 集群获取最新状态
  81. cloudTaskInfo, err := adapter.GetContainer(ctx, cloud.Name, cloud.BusinessCode, utils.Int64ToString(cloud.ClusterId))
  82. if err != nil {
  83. logx.Errorf("Failed to get task status from CLOUD executor for job %s: %v", cloud.Name, err)
  84. continue // 继续处理下一个任务
  85. }
  86. if cloudTaskInfo.Status == "" {
  87. continue
  88. }
  89. // 如果状态没有变化,则跳过
  90. if cloud.Status == cloudTaskInfo.Status {
  91. continue
  92. }
  93. // 5. 准备更新
  94. startTime := convertUTCTimeToCST(cloudTaskInfo.Start)
  95. previousStatus := cloud.Status
  96. cloud.Status = cloudTaskInfo.Status
  97. cloud.StartTime = startTime
  98. task.Status = cloudTaskInfo.Status
  99. task.StartTime = startTime
  100. task.EndTime = cloudTaskInfo.End
  101. logx.Infof("CLOUD task status change detected for job %s: %s -> %s", cloud.Name, previousStatus, cloud.Status)
  102. // 6. 在事务中更新数据库
  103. err = svc.DbEngin.Transaction(func(tx *gorm.DB) error {
  104. task.UpdatedTime = time.Now().Format(constants.Layout)
  105. if err := tx.Table("task").Updates(task).Error; err != nil {
  106. return fmt.Errorf("failed to update task table: %w", err)
  107. }
  108. if err := tx.Table("task_cloud").Updates(cloud).Error; err != nil {
  109. return fmt.Errorf("failed to update cloud_task table: %w", err)
  110. }
  111. return nil
  112. })
  113. if err != nil {
  114. logx.Errorf("Failed to update database in transaction for job %s: %v", cloud.Name, err)
  115. // 事务失败,回滚状态,继续处理下一个任务
  116. cloud.Status = previousStatus
  117. task.Status = previousStatus
  118. continue
  119. }
  120. // 7. 根据新状态执行后续操作 (通知、报告等)
  121. handleNoticeChange(svc, task, cloud, cloudTaskInfo.Status)
  122. }
  123. }
  124. // handleStatusChange 根据新状态执行后续操作
  125. func handleNoticeChange(svc *svc.ServiceContext, task *types.TaskModel, cloud *models.TaskCloud, newStatus string) {
  126. adapterIDStr := strconv.FormatInt(cloud.AdapterId, 10)
  127. clusterIDStr := strconv.FormatInt(cloud.ClusterId, 10)
  128. var noticeType, noticeMessage string
  129. var reportSuccess bool
  130. var shouldReport bool
  131. switch newStatus {
  132. case constants.Running:
  133. noticeType = "running"
  134. noticeMessage = "任务运行中"
  135. case constants.Failed:
  136. noticeType = "failed"
  137. noticeMessage = "任务失败"
  138. reportSuccess = false
  139. shouldReport = true
  140. case constants.Completed:
  141. noticeType = "completed"
  142. noticeMessage = "任务完成"
  143. reportSuccess = true
  144. shouldReport = true
  145. case constants.Pending:
  146. noticeType = "pending"
  147. noticeMessage = "任务pending"
  148. default:
  149. // 对于其他未知状态,可以选择记录日志并返回
  150. logx.Errorf("Unhandled CLOUD task status '%s' for job %s", newStatus, cloud.Name)
  151. return
  152. }
  153. // 发送通知
  154. svc.Scheduler.CloudStorages.AddNoticeInfo(adapterIDStr, cloud.AdapterName, clusterIDStr, cloud.ClusterName, cloud.Name, noticeType, noticeMessage)
  155. logx.Infof("[%s]: 任务状态变更为 [%s],发送通知。", cloud.Name, newStatus)
  156. // 上报状态
  157. if shouldReport {
  158. if err := reportCloudStatusMessages(svc, task, cloud, reportSuccess, noticeMessage); err != nil {
  159. logx.Errorf("Failed to report Cloud status for job %s: %v", cloud.Name, err)
  160. }
  161. }
  162. }
  163. func convertUTCTimeToCST(utcTimeStr string) string {
  164. if utcTimeStr == "" {
  165. return ""
  166. }
  167. // 定义多种可能的时间格式
  168. timeFormats := []string{
  169. "2006-01-02T15:04:05Z", // ISO 8601 格式
  170. "2006-01-02 15:04:05", // 常见格式
  171. "2006-01-02T15:04:05", // ISO 无时区
  172. "2006-01-02 15:04:05Z", // 带Z的常见格式
  173. time.RFC3339, // RFC3339 标准格式
  174. "2006-01-02T15:04:05.000Z", // 带毫秒的ISO格式
  175. }
  176. var utcTime time.Time
  177. var err error
  178. // 尝试解析多种格式
  179. for _, format := range timeFormats {
  180. utcTime, err = time.Parse(format, utcTimeStr)
  181. if err == nil {
  182. break
  183. }
  184. }
  185. // 如果所有格式都失败,记录警告并返回原字符串
  186. if err != nil {
  187. logx.Errorf("Failed to parse time string '%s' with all known formats, returning original string", utcTimeStr)
  188. return utcTimeStr
  189. }
  190. // 创建东八区时区
  191. cstZone := time.FixedZone("CST", 8*3600)
  192. // 将UTC时间转换为东八区时间
  193. cstTime := utcTime.In(cstZone)
  194. // 格式化东八区时间
  195. return cstTime.Format("2006-01-02T15:04:05-07:00")
  196. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.