// Package tasksync synchronises locally stored inference deploy instances
// with the status reported by the clusters they run on.
package tasksync

import (
	"net/http"
	"strconv"
	"sync"
	"time"

	"github.com/zeromicro/go-zero/core/logx"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
)

// SyncInfer refreshes the stored status of inference deploy instances from
// the inference clusters and reports status changes to the JCS middleware.
type SyncInfer struct {
	// mu serialises the dispatch phase of UpdateDeployInstanceStatusBatch.
	mu sync.Mutex
	// aiStorages is the persistence layer for deploy tasks and instances.
	aiStorages *database.AiStorage
	// inferenceAdapterMap is indexed by adapter id, then by cluster id
	// (both formatted as base-10 strings).
	inferenceAdapterMap map[string]map[string]inference.ICluster
	// config supplies the JCS job status report URL.
	config *config.Config
}

// NewInferTask builds a SyncInfer from the given storage, adapter/cluster
// lookup table and configuration.
func NewInferTask(storage *database.AiStorage, inferenceAdapterMap map[string]map[string]inference.ICluster, config *config.Config) *SyncInfer {
	return &SyncInfer{
		aiStorages:          storage,
		inferenceAdapterMap: inferenceAdapterMap,
		config:              config,
	}
}

// UpdateDeployInstanceStatusBatch starts a status refresh for every instance
// in insList, running at most three refreshes at a time.
//
// When needfilter is true, instances whose stored status is already Running,
// Stopped or Failed are skipped. Nil entries are always skipped. insList
// itself is never modified.
//
// The method returns once the last refresh has been started; it does not wait
// for the started goroutines to finish.
func (s *SyncInfer) UpdateDeployInstanceStatusBatch(insList []*models.AiInferDeployInstance, needfilter bool) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Build a private, order-preserving selection so the caller's slice is
	// left untouched.
	list := make([]*models.AiInferDeployInstance, 0, len(insList))
	for _, ins := range insList {
		if ins == nil {
			continue
		}
		if needfilter && (ins.Status == constants.Running || ins.Status == constants.Stopped || ins.Status == constants.Failed) {
			continue
		}
		list = append(list, ins)
	}
	if len(list) == 0 {
		return
	}

	// buffer acts as a counting semaphore: the send blocks while three
	// refreshes are in flight, and each refresh frees its slot when done.
	buffer := make(chan bool, 3)
	for _, instance := range list {
		buffer <- true
		go s.UpdateDeployInstanceStatus(instance, false, buffer)
	}
}

// UpdateDeployTaskStatus refreshes the instances that belong to the deploy
// task with the most recent UpdateTime, running at most two refreshes at a
// time. It does nothing when there are no deploy tasks.
func (s *SyncInfer) UpdateDeployTaskStatus() {
	list, err := s.aiStorages.GetAllDeployTasks()
	if err != nil {
		logx.Errorf("get all deploy tasks: %v", err)
		return
	}
	// Guard against an empty result; indexing list[0] would panic otherwise.
	if len(list) == 0 {
		return
	}

	// Parse errors are deliberately ignored: an unparsable timestamp yields
	// the zero time, so such a task can never displace a task with a valid one.
	ins := list[0]
	newest, _ := time.Parse(time.RFC3339, ins.UpdateTime)
	for i := 1; i < len(list); i++ {
		candidate, _ := time.Parse(time.RFC3339, list[i].UpdateTime)
		if candidate.After(newest) {
			ins, newest = list[i], candidate
		}
	}

	inslist, err := s.aiStorages.GetInstanceListByDeployTaskId(ins.Id)
	if err != nil {
		logx.Errorf("get instances of deploy task: %v", err)
		return
	}

	// Counting semaphore limiting the number of concurrent refreshes to two.
	buffer := make(chan bool, 2)
	for _, instance := range inslist {
		buffer <- true
		go s.UpdateDeployInstanceStatus(instance, false, buffer)
	}
}

// resolveInferDeployStatus translates the status string reported by a cluster
// into the status that should be stored for instance.
//
// known is true when remote is one of the states recognised for the
// instance's cluster type; status is then the matching constants value. For
// an unrecognised remote state of a recognised cluster type, status is remote
// itself. For an unrecognised cluster type, status is the instance's current
// status, i.e. nothing changes.
//
// notify is true when the transition must be reported to JCS.
func resolveInferDeployStatus(instance *models.AiInferDeployInstance, remote string) (status string, known bool, notify bool) {
	switch instance.ClusterType {
	case storeLink.TYPE_OCTOPUS:
		switch remote {
		case "running":
			return constants.Running, true, true
		case "stopped":
			return constants.Stopped, true, false
		}
	case storeLink.TYPE_MODELARTS:
		switch remote {
		case "running":
			return constants.Running, true, true
		case "stopped":
			return constants.Stopped, true, false
		case "failed":
			return constants.Failed, true, true
		}
	case storeLink.TYPE_SHUGUANGAI:
		switch remote {
		case "Running":
			// Unlike the other cluster types, no report is sent here.
			return constants.Running, true, false
		case "Terminated":
			return constants.Stopped, true, false
		}
	case storeLink.TYPE_OPENI:
		switch remote {
		case "RUNNING":
			return constants.Running, true, true
		case "STOPPED":
			return constants.Stopped, true, false
		case "CREATED_FAILED", "FAILED":
			return constants.Failed, true, true
		}
	default:
		return instance.Status, false, false
	}
	// Recognised cluster type, unrecognised state: store it verbatim.
	return remote, false, false
}

// UpdateDeployInstanceStatus queries the cluster that hosts instance, maps
// the reported status onto the stored status and persists the result.
//
// If the reported status maps to Running, Stopped or Failed and the instance
// already has that status, nothing is reported or persisted. Transitions to
// Running (except on ShuguangAI clusters) and to Failed are reported through
// ReportInferenceStatusMessages; a failed report is logged and does not
// prevent the status from being persisted.
//
// updatetime is forwarded to the storage layer's UpdateInferDeployInstance.
//
// ch, when non-nil, is treated as a semaphore owned by the caller: exactly
// one value is received from it before the function returns, on every path.
func (s *SyncInfer) UpdateDeployInstanceStatus(instance *models.AiInferDeployInstance, updatetime bool, ch chan bool) {
	// Free the caller's concurrency slot on every exit path, panics included.
	if ch != nil {
		defer func() { <-ch }()
	}
	if instance == nil {
		return
	}

	amap, found := s.inferenceAdapterMap[strconv.FormatInt(instance.AdapterId, 10)]
	if !found {
		return
	}
	cmap, found := amap[strconv.FormatInt(instance.ClusterId, 10)]
	if !found {
		return
	}

	// A zero http.Request has no context attached, so Context() returns the
	// background context.
	h := http.Request{}
	ins, err := cmap.GetInferDeployInstance(h.Context(), instance.InstanceId)
	if err != nil {
		logx.Errorf("get infer deploy instance %s: %v", instance.InstanceId, err)
		return
	}

	status, known, notify := resolveInferDeployStatus(instance, ins.Status)
	if known && instance.Status == status {
		// Already in the target state: nothing to report or persist.
		return
	}

	if notify {
		taskID := strconv.FormatInt(instance.DeployInstanceTaskId, 10)
		clusterID := strconv.FormatInt(instance.ClusterId, 10)

		var reportErr error
		if status == constants.Running {
			reportErr = s.ReportInferenceStatusMessages(instance, instance.InstanceName, taskID, clusterID, ins.InferUrl, true, "")
		} else {
			// Failure: the raw cluster status is passed along as the message.
			reportErr = s.ReportInferenceStatusMessages(instance, instance.InstanceName, taskID, clusterID, "", false, ins.Status)
		}
		if reportErr != nil {
			logx.Errorf("############ Report Infer Task Status Message Error %s", reportErr.Error())
		}
	}
	instance.Status = status

	if err := s.aiStorages.UpdateInferDeployInstance(instance, updatetime); err != nil {
		logx.Errorf("update infer deploy instance %s: %v", instance.InstanceId, err)
	}
}

// UpdateAutoStoppedInstance refreshes, without any status filtering, every
// instance returned by the storage layer's
// GetInferDeployInstanceListLastMonth.
func (s *SyncInfer) UpdateAutoStoppedInstance() {
	list, err := s.aiStorages.GetInferDeployInstanceListLastMonth()
	if err != nil {
		logx.Errorf("get infer deploy instances of last month: %v", err)
		return
	}
	if len(list) == 0 {
		return
	}
	s.UpdateDeployInstanceStatusBatch(list, false)
}

// CheckStopStatus reports whether in.Status is the "stopped" state of the
// cluster type given by in.ClusterType. Unknown cluster types yield false.
func (s *SyncInfer) CheckStopStatus(in *inference.DeployInstance) bool {
	switch in.ClusterType {
	case storeLink.TYPE_OCTOPUS, storeLink.TYPE_MODELARTS:
		return in.Status == "stopped"
	case storeLink.TYPE_SHUGUANGAI:
		return in.Status == "Terminated"
	case storeLink.TYPE_OPENI:
		return in.Status == "STOPPED"
	default:
		return false
	}
}

// CheckRunningStatus reports whether in.Status counts as running for the
// cluster type given by in.ClusterType. For OpenI both "RUNNING" and
// "WAITING" count as running. Unknown cluster types yield false.
func (s *SyncInfer) CheckRunningStatus(in *inference.DeployInstance) bool {
	switch in.ClusterType {
	case storeLink.TYPE_OCTOPUS, storeLink.TYPE_MODELARTS:
		return in.Status == "running"
	case storeLink.TYPE_SHUGUANGAI:
		return in.Status == "Running"
	case storeLink.TYPE_OPENI:
		return in.Status == "RUNNING" || in.Status == "WAITING"
	default:
		return false
	}
}

// ReportInferenceStatusMessages sends an "Inference" status report to the
// JCS job status report URL taken from the configuration.
//
// taskName, taskId, url, status and msg are copied into the report as given.
// When ins is non-nil its Id, AdapterId, ClusterId and InstanceId are
// included as well; otherwise those report fields are left empty. The
// clusterId parameter is not used: the report's cluster id comes from ins.
//
// It returns the error from jcs.StatusReport, if any.
func (s *SyncInfer) ReportInferenceStatusMessages(ins *models.AiInferDeployInstance, taskName string, taskId string, clusterId string, url string, status bool, msg string) error {
	var id, adapterID, clusterID, instanceID string
	if ins != nil {
		id = strconv.FormatInt(ins.Id, 10)
		adapterID = strconv.FormatInt(ins.AdapterId, 10)
		clusterID = strconv.FormatInt(ins.ClusterId, 10)
		instanceID = ins.InstanceId
	}

	report := &jcs.JobStatusReportReq{}
	report.Report = &jcs.InferReportMessage{
		Type:       "Inference",
		TaskName:   taskName,
		TaskID:     taskId,
		Status:     status,
		Message:    msg,
		Url:        url,
		ID:         id,
		AdapterID:  adapterID,
		ClusterID:  clusterID,
		InstanceID: instanceID,
	}

	return jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report)
}