Browse Source

Merge pull request 'fix reportstatus bugs' (#475) from tzwang/pcm-coordinator:master into master

pull/478/head
tzwang 6 months ago
parent
commit
7babb44c0b
5 changed files with 29 additions and 19 deletions
  1. +6
    -3
      internal/scheduler/schedulers/aiScheduler.go
  2. +2
    -4
      internal/scheduler/service/utils/jcs/middleware.go
  3. +3
    -1
      internal/scheduler/service/utils/status/hpc_task_sync.go
  4. +11
    -6
      internal/scheduler/service/utils/status/statusSync.go
  5. +7
    -5
      internal/scheduler/service/utils/status/taskStatusSync.go

+ 6
- 3
internal/scheduler/schedulers/aiScheduler.go View File

@@ -278,7 +278,8 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
}

//report msg
- report := &jcs.TrainReportMessage{
+ report := &jcs.JobStatusReportReq{}
+ reportMsg := &jcs.TrainReportMessage{
Type: "Train",
TaskName: "",
TaskID: strconv.FormatInt(taskId, 10),
@@ -287,7 +288,7 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
ClusterID: e.clusterId,
Output: "",
}
+ report.Report = reportMsg
//report status
_ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)

@@ -315,7 +316,8 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
}
}
//add report msg
- report := &jcs.TrainReportMessage{
+ report := &jcs.JobStatusReportReq{}
+ reportMsg := &jcs.TrainReportMessage{
Type: "Train",
TaskName: "",
TaskID: strconv.FormatInt(taskId, 10),
@@ -324,6 +326,7 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
ClusterID: s.ClusterId,
Output: "",
}
+ report.Report = reportMsg
//report status
_ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
}


+ 2
- 4
internal/scheduler/service/utils/jcs/middleware.go View File

@@ -10,9 +10,7 @@ import (
)

type JobStatusReportReq struct {
- TaskName string `json:"taskName"`
- TaskID string `json:"taskID"`
- Messages []interface{} `json:"messages"`
+ Report interface{} `json:"report"`
}
type TrainReportMessage struct {
Type string `json:"type"`
@@ -44,7 +42,7 @@ func StatusReport(url string, report interface{}) error {
rp, err := req.
SetHeader("Content-Type", "application/json").
SetBody(report).
- SetResult(&resp).
+ SetResult(resp).
Post(url)

if err != nil {


+ 3
- 1
internal/scheduler/service/utils/status/hpc_task_sync.go View File

@@ -14,7 +14,8 @@ import (
)

func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error {
- report := &jcs.TrainReportMessage{
+ report := &jcs.JobStatusReportReq{}
+ reportMsg := &jcs.TrainReportMessage{
Type: "Train",
TaskName: task.Name,
TaskID: strconv.FormatInt(task.Id, 10),
@@ -23,6 +24,7 @@ func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpc
ClusterID: strconv.FormatInt(hpcTask.ClusterId, 10),
Output: hpcTask.WorkDir,
}
+ report.Report = reportMsg

marshal, _ := jsoniter.MarshalToString(report)
log.Debug().Msgf("通知中间件任务状态参数: [%v]", marshal)


+ 11
- 6
internal/scheduler/service/utils/status/statusSync.go View File

@@ -192,7 +192,8 @@ func (s *TaskStatus) updateAiTask(aiTaskList []*models.TaskAi) {
}

func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.TaskAi) error {
- report := &jcs.TrainReportMessage{
+ report := &jcs.JobStatusReportReq{}
+ reportMsg := &jcs.TrainReportMessage{
Type: "Train",
TaskName: task.Name,
TaskID: strconv.FormatInt(task.Id, 10),
@@ -206,10 +207,12 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.
output = aiTask.Output
}

- report.Status = true
- report.Message = ""
- report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
- report.Output = output
+ reportMsg.Status = true
+ reportMsg.Message = ""
+ reportMsg.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
+ reportMsg.Output = output

+ report.Report = reportMsg

err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report)
if err != nil {
@@ -224,7 +227,8 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.
}

func ReportStatus(svc *svc.ServiceContext, taskName string, taskId string, clusterId string, url string, status bool, msg string) error {
- report := &jcs.InferReportMessage{
+ report := &jcs.JobStatusReportReq{}
+ reportMsg := &jcs.InferReportMessage{
Type: "Inference",
TaskName: taskName,
TaskID: taskId,
@@ -233,6 +237,7 @@ func ReportStatus(svc *svc.ServiceContext, taskName string, taskId string, clust
ClusterID: clusterId,
Url: url,
}
+ report.Report = reportMsg

err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
if err != nil {


+ 7
- 5
internal/scheduler/service/utils/status/taskStatusSync.go View File

@@ -166,7 +166,8 @@ func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
}

func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error {
- report := &jcs.TrainReportMessage{
+ report := &jcs.JobStatusReportReq{}
+ reportMsg := &jcs.TrainReportMessage{
Type: "Train",
TaskName: task.Name,
TaskID: strconv.FormatInt(task.Id, 10),
@@ -180,10 +181,11 @@ func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask
output = aiTask.Output
}

- report.Status = true
- report.Message = ""
- report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
- report.Output = output
+ reportMsg.Status = true
+ reportMsg.Message = ""
+ reportMsg.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
+ reportMsg.Output = output
+ report.Report = reportMsg

err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
if err != nil {


Loading…
Cancel
Save