| @@ -7,6 +7,7 @@ import ( | |||||
| "fmt" | "fmt" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/status" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" | ||||
| "strconv" | "strconv" | ||||
| @@ -78,6 +79,10 @@ func (l *CreateInferenceTaskLogic) CreateInferenceTask(req *types.CreateInferenc | |||||
| err = l.createInferenceTask(taskId, adapterClusterMap, opt) | err = l.createInferenceTask(taskId, adapterClusterMap, opt) | ||||
| if err != nil { | if err != nil { | ||||
| if len(assignedClusters) != 0 { | |||||
| _ = status.ReportStatus(l.svcCtx, taskName, strconv.FormatInt(taskId, 10), assignedClusters[0].ClusterId, "", false, "") | |||||
| logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) | |||||
| } | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| @@ -261,13 +261,6 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass | |||||
| return err | return err | ||||
| } | } | ||||
| //report msg | |||||
| report := &jcs.JobStatusReportReq{ | |||||
| TaskName: "", | |||||
| TaskID: strconv.FormatInt(taskId, 10), | |||||
| Messages: make([]*jcs.ReportMessage, 0), | |||||
| } | |||||
| var errmsg string | var errmsg string | ||||
| for _, err := range errs { | for _, err := range errs { | ||||
| e := (err).(struct { | e := (err).(struct { | ||||
| @@ -284,14 +277,23 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass | |||||
| return errors.New("database add failed: " + err.Error()) | return errors.New("database add failed: " + err.Error()) | ||||
| } | } | ||||
| //add report msg | |||||
| jobMsg := &jcs.ReportMessage{ | |||||
| //report msg | |||||
| report := &jcs.TrainReportMessage{ | |||||
| Type: "Train", | |||||
| TaskName: "", | |||||
| TaskID: strconv.FormatInt(taskId, 10), | |||||
| Status: false, | Status: false, | ||||
| Message: msg, | Message: msg, | ||||
| ClusterID: e.clusterId, | ClusterID: e.clusterId, | ||||
| Output: "", | Output: "", | ||||
| } | } | ||||
| report.Messages = append(report.Messages, jobMsg) | |||||
| //report status | |||||
| _ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | |||||
| logx.Errorf(errors.New(errmsg).Error()) | |||||
| return errors.New(errmsg) | |||||
| } | } | ||||
| for _, s := range results { | for _, s := range results { | ||||
| as.option.ComputeCard = s.Card //execute card | as.option.ComputeCard = s.Card //execute card | ||||
| @@ -313,18 +315,19 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass | |||||
| } | } | ||||
| } | } | ||||
| //add report msg | //add report msg | ||||
| jobMsg := &jcs.ReportMessage{ | |||||
| report := &jcs.TrainReportMessage{ | |||||
| Type: "Train", | |||||
| TaskName: "", | |||||
| TaskID: strconv.FormatInt(taskId, 10), | |||||
| Status: false, | Status: false, | ||||
| Message: s.Msg, | Message: s.Msg, | ||||
| ClusterID: s.ClusterId, | ClusterID: s.ClusterId, | ||||
| Output: "", | Output: "", | ||||
| } | } | ||||
| report.Messages = append(report.Messages, jobMsg) | |||||
| //report status | |||||
| _ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | |||||
| } | } | ||||
| //report status | |||||
| _ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | |||||
| logx.Errorf(errors.New(errmsg).Error()) | logx.Errorf(errors.New(errmsg).Error()) | ||||
| return errors.New(errmsg) | return errors.New(errmsg) | ||||
| } | } | ||||
| @@ -10,18 +10,30 @@ import ( | |||||
| ) | ) | ||||
| type JobStatusReportReq struct { | type JobStatusReportReq struct { | ||||
| TaskName string `json:"taskName"` | |||||
| TaskID string `json:"taskID"` | |||||
| Messages []*ReportMessage `json:"messages"` | |||||
| TaskName string `json:"taskName"` | |||||
| TaskID string `json:"taskID"` | |||||
| Messages []interface{} `json:"messages"` | |||||
| } | } | ||||
| type ReportMessage struct { | |||||
| type TrainReportMessage struct { | |||||
| Type string `json:"type"` | |||||
| TaskName string `json:"taskName"` | |||||
| TaskID string `json:"taskID"` | |||||
| Status bool `json:"status"` | Status bool `json:"status"` | ||||
| Message string `json:"message"` | Message string `json:"message"` | ||||
| ClusterID string `json:"clusterID"` | ClusterID string `json:"clusterID"` | ||||
| Output string `json:"output"` | Output string `json:"output"` | ||||
| } | } | ||||
// InferReportMessage is the status report payload for an inference task.
//
// It carries the same identification and status fields as TrainReportMessage,
// with Url taking the place of Output. The serialized field names are fixed by
// the json tags below.
type InferReportMessage struct {
	// Type labels the kind of report.
	// NOTE(review): the visible constructor sets this to "Inference" — confirm
	// that is the only value the receiver expects.
	Type string `json:"type"`

	// TaskName and TaskID identify the task being reported.
	TaskName string `json:"taskName"`
	TaskID   string `json:"taskID"`

	// Status is the reported outcome flag; Message is accompanying text.
	Status  bool   `json:"status"`
	Message string `json:"message"`

	// ClusterID identifies the cluster the report refers to.
	ClusterID string `json:"clusterID"`

	// Url is the URL attached to the report.
	// NOTE(review): presumably the inference endpoint, left empty on failure —
	// verify against callers.
	Url string `json:"url"`
}
| func StatusReport(url string, report *JobStatusReportReq) error { | |||||
| func StatusReport(url string, report interface{}) error { | |||||
| resp := struct { | resp := struct { | ||||
| Code string `json:"code"` | Code string `json:"code"` | ||||
| Msg string `json:"message"` | Msg string `json:"message"` | ||||
| @@ -49,7 +61,7 @@ func StatusReport(url string, report *JobStatusReportReq) error { | |||||
| return nil | return nil | ||||
| } | } | ||||
| func TempSaveReportToTask(store *database.AiStorage, task *types.TaskModel, report *JobStatusReportReq) error { | |||||
| func TempSaveReportToTask(store *database.AiStorage, task *types.TaskModel, report interface{}) error { | |||||
| jsonBytes, err := json.Marshal(report) | jsonBytes, err := json.Marshal(report) | ||||
| task.Result = string(jsonBytes) | task.Result = string(jsonBytes) | ||||
| @@ -1,6 +1,7 @@ | |||||
| package status | package status | ||||
| import ( | import ( | ||||
| "github.com/zeromicro/go-zero/core/logx" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | ||||
| @@ -97,6 +98,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe | |||||
| } | } | ||||
| return | return | ||||
| } | } | ||||
| url := ins.InferUrl | |||||
| err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "") | |||||
| if err != nil { | |||||
| logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) | |||||
| } | |||||
| instance.Status = constants.Running | instance.Status = constants.Running | ||||
| case "stopped": | case "stopped": | ||||
| if instance.Status == constants.Stopped { | if instance.Status == constants.Stopped { | ||||
| @@ -120,6 +126,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe | |||||
| } | } | ||||
| return | return | ||||
| } | } | ||||
| url := ins.InferUrl | |||||
| err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "") | |||||
| if err != nil { | |||||
| logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) | |||||
| } | |||||
| instance.Status = constants.Running | instance.Status = constants.Running | ||||
| case "stopped": | case "stopped": | ||||
| if instance.Status == constants.Stopped { | if instance.Status == constants.Stopped { | ||||
| @@ -130,6 +141,19 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe | |||||
| return | return | ||||
| } | } | ||||
| instance.Status = constants.Stopped | instance.Status = constants.Stopped | ||||
| case "failed": | |||||
| if instance.Status == constants.Failed { | |||||
| if ch != nil { | |||||
| <-ch | |||||
| return | |||||
| } | |||||
| return | |||||
| } | |||||
| err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status) | |||||
| if err != nil { | |||||
| logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) | |||||
| } | |||||
| instance.Status = constants.Failed | |||||
| default: | default: | ||||
| instance.Status = ins.Status | instance.Status = ins.Status | ||||
| } | } | ||||
| @@ -166,6 +190,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe | |||||
| } | } | ||||
| return | return | ||||
| } | } | ||||
| url := ins.InferUrl | |||||
| err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "") | |||||
| if err != nil { | |||||
| logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) | |||||
| } | |||||
| instance.Status = constants.Running | instance.Status = constants.Running | ||||
| case "STOPPED": | case "STOPPED": | ||||
| if instance.Status == constants.Stopped { | if instance.Status == constants.Stopped { | ||||
| @@ -184,6 +213,10 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe | |||||
| } | } | ||||
| return | return | ||||
| } | } | ||||
| err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status) | |||||
| if err != nil { | |||||
| logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) | |||||
| } | |||||
| instance.Status = constants.Failed | instance.Status = constants.Failed | ||||
| case "FAILED": | case "FAILED": | ||||
| if instance.Status == constants.Failed { | if instance.Status == constants.Failed { | ||||
| @@ -193,6 +226,10 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe | |||||
| } | } | ||||
| return | return | ||||
| } | } | ||||
| err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status) | |||||
| if err != nil { | |||||
| logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) | |||||
| } | |||||
| instance.Status = constants.Failed | instance.Status = constants.Failed | ||||
| default: | default: | ||||
| instance.Status = ins.Status | instance.Status = ins.Status | ||||
| @@ -14,19 +14,16 @@ import ( | |||||
| ) | ) | ||||
| func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error { | func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error { | ||||
| report := &jcs.JobStatusReportReq{ | |||||
| TaskName: task.Name, | |||||
| TaskID: strconv.FormatInt(task.Id, 10), | |||||
| Messages: make([]*jcs.ReportMessage, 0), | |||||
| } | |||||
| jobMsg := &jcs.ReportMessage{ | |||||
| report := &jcs.TrainReportMessage{ | |||||
| Type: "Train", | |||||
| TaskName: task.Name, | |||||
| TaskID: strconv.FormatInt(task.Id, 10), | |||||
| Status: status, | Status: status, | ||||
| Message: message, | Message: message, | ||||
| ClusterID: strconv.FormatInt(hpcTask.ClusterId, 10), | ClusterID: strconv.FormatInt(hpcTask.ClusterId, 10), | ||||
| Output: hpcTask.WorkDir, | Output: hpcTask.WorkDir, | ||||
| } | } | ||||
| report.Messages = append(report.Messages, jobMsg) | |||||
| marshal, _ := jsoniter.MarshalToString(report) | marshal, _ := jsoniter.MarshalToString(report) | ||||
| log.Debug().Msgf("通知中间件任务状态参数: [%v]", marshal) | log.Debug().Msgf("通知中间件任务状态参数: [%v]", marshal) | ||||
| err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | ||||
| @@ -8,6 +8,7 @@ import ( | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | ||||
| @@ -191,12 +192,12 @@ func (s *TaskStatus) updateAiTask(aiTaskList []*models.TaskAi) { | |||||
| } | } | ||||
| func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.TaskAi) error { | func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.TaskAi) error { | ||||
| report := &jcs.JobStatusReportReq{ | |||||
| report := &jcs.TrainReportMessage{ | |||||
| Type: "Train", | |||||
| TaskName: task.Name, | TaskName: task.Name, | ||||
| TaskID: strconv.FormatInt(task.Id, 10), | TaskID: strconv.FormatInt(task.Id, 10), | ||||
| Messages: make([]*jcs.ReportMessage, 0), | |||||
| } | } | ||||
| //add report msg | |||||
| var output string | var output string | ||||
| switch aiTask.ClusterName { | switch aiTask.ClusterName { | ||||
| case "openI": | case "openI": | ||||
| @@ -205,13 +206,10 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models. | |||||
| output = aiTask.Output | output = aiTask.Output | ||||
| } | } | ||||
| jobMsg := &jcs.ReportMessage{ | |||||
| Status: true, | |||||
| Message: "", | |||||
| ClusterID: strconv.FormatInt(aiTask.ClusterId, 10), | |||||
| Output: output, | |||||
| } | |||||
| report.Messages = append(report.Messages, jobMsg) | |||||
| report.Status = true | |||||
| report.Message = "" | |||||
| report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10) | |||||
| report.Output = output | |||||
| err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report) | err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -224,3 +222,21 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models. | |||||
| } | } | ||||
| return nil | return nil | ||||
| } | } | ||||
| func ReportStatus(svc *svc.ServiceContext, taskName string, taskId string, clusterId string, url string, status bool, msg string) error { | |||||
| report := &jcs.InferReportMessage{ | |||||
| Type: "Inference", | |||||
| TaskName: taskName, | |||||
| TaskID: taskId, | |||||
| Status: status, | |||||
| Message: msg, | |||||
| ClusterID: clusterId, | |||||
| Url: url, | |||||
| } | |||||
| err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| return nil | |||||
| } | |||||
| @@ -166,12 +166,12 @@ func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) { | |||||
| } | } | ||||
| func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error { | func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error { | ||||
| report := &jcs.JobStatusReportReq{ | |||||
| report := &jcs.TrainReportMessage{ | |||||
| Type: "Train", | |||||
| TaskName: task.Name, | TaskName: task.Name, | ||||
| TaskID: strconv.FormatInt(task.Id, 10), | TaskID: strconv.FormatInt(task.Id, 10), | ||||
| Messages: make([]*jcs.ReportMessage, 0), | |||||
| } | } | ||||
| //add report msg | |||||
| var output string | var output string | ||||
| switch aiTask.ClusterName { | switch aiTask.ClusterName { | ||||
| case "openI": | case "openI": | ||||
| @@ -180,17 +180,17 @@ func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask | |||||
| output = aiTask.Output | output = aiTask.Output | ||||
| } | } | ||||
| jobMsg := &jcs.ReportMessage{ | |||||
| Status: true, | |||||
| Message: "", | |||||
| ClusterID: strconv.FormatInt(aiTask.ClusterId, 10), | |||||
| Output: output, | |||||
| } | |||||
| report.Messages = append(report.Messages, jobMsg) | |||||
| report.Status = true | |||||
| report.Message = "" | |||||
| report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10) | |||||
| report.Output = output | |||||
| _ = jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | |||||
| err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| err := jcs.TempSaveReportToTask(svc.Scheduler.AiStorages, task, report) | |||||
| err = jcs.TempSaveReportToTask(svc.Scheduler.AiStorages, task, report) | |||||
| if err != nil { | if err != nil { | ||||
| return err | return err | ||||
| } | } | ||||