|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390 |
- package tasksync
-
- import (
- "github.com/zeromicro/go-zero/core/logx"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
- "net/http"
- "strconv"
- "sync"
- "time"
- )
-
- type SyncInfer struct {
- mu sync.Mutex
- aiStorages *database.AiStorage
- inferenceAdapterMap map[string]map[string]inference.ICluster
- config *config.Config
- }
-
- func NewInferTask(storage *database.AiStorage, inferenceAdapterMap map[string]map[string]inference.ICluster, config *config.Config) *SyncInfer {
- return &SyncInfer{
- aiStorages: storage,
- inferenceAdapterMap: inferenceAdapterMap,
- config: config,
- }
- }
-
- func (s *SyncInfer) UpdateDeployInstanceStatusBatch(insList []*models.AiInferDeployInstance, needfilter bool) {
- s.mu.Lock()
- defer s.mu.Unlock()
- list := make([]*models.AiInferDeployInstance, len(insList))
- copy(list, insList)
-
- if needfilter {
- for i := len(list) - 1; i >= 0; i-- {
- if list[i].Status == constants.Running || list[i].Status == constants.Stopped || list[i].Status == constants.Failed {
- list = append(list[:i], list[i+1:]...)
- }
- }
- }
-
- if len(list) == 0 {
- return
- }
-
- buffer := make(chan bool, 3)
- for _, instance := range list {
- buffer <- true
- go s.UpdateDeployInstanceStatus(instance, false, buffer)
- }
- }
-
- func (s *SyncInfer) UpdateDeployTaskStatus() {
- list, err := s.aiStorages.GetAllDeployTasks()
- if err != nil {
- return
- }
-
- ins := list[0]
- for i := range list {
- uTime, _ := time.Parse(time.RFC3339, ins.UpdateTime)
- latest, _ := time.Parse(time.RFC3339, list[i].UpdateTime)
- if latest.After(uTime) {
- ins = list[i]
- }
- }
- inslist, err := s.aiStorages.GetInstanceListByDeployTaskId(ins.Id)
- if err != nil {
- return
- }
-
- buffer := make(chan bool, 2)
- for _, instance := range inslist {
- buffer <- true
- go s.UpdateDeployInstanceStatus(instance, false, buffer)
- }
- }
-
- func (s *SyncInfer) UpdateDeployInstanceStatus(instance *models.AiInferDeployInstance, updatetime bool, ch chan bool) {
- amap, found := s.inferenceAdapterMap[strconv.FormatInt(instance.AdapterId, 10)]
- if !found {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- cmap, found := amap[strconv.FormatInt(instance.ClusterId, 10)]
- if !found {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- h := http.Request{}
- ins, err := cmap.GetInferDeployInstance(h.Context(), instance.InstanceId)
- if err != nil {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- switch instance.ClusterType {
- case storeLink.TYPE_OCTOPUS:
- switch ins.Status {
- case "running":
- if instance.Status == constants.Running {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- url := ins.InferUrl
- err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
- if err != nil {
- logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
- }
- instance.Status = constants.Running
- case "stopped":
- if instance.Status == constants.Stopped {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- instance.Status = constants.Stopped
- default:
- instance.Status = ins.Status
- }
- case storeLink.TYPE_MODELARTS:
- switch ins.Status {
- case "running":
- if instance.Status == constants.Running {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- url := ins.InferUrl
- err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
- if err != nil {
- logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
- }
- instance.Status = constants.Running
- case "stopped":
- if instance.Status == constants.Stopped {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- instance.Status = constants.Stopped
- case "failed":
- if instance.Status == constants.Failed {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
- if err != nil {
- logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
- }
- instance.Status = constants.Failed
- default:
- instance.Status = ins.Status
- }
- case storeLink.TYPE_SHUGUANGAI:
- switch ins.Status {
- case "Running":
- if instance.Status == constants.Running {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- instance.Status = constants.Running
- case "Terminated":
- if instance.Status == constants.Stopped {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- instance.Status = constants.Stopped
- default:
- instance.Status = ins.Status
- }
- case storeLink.TYPE_OPENI:
- switch ins.Status {
- case "RUNNING":
- if instance.Status == constants.Running {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- url := ins.InferUrl
- err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
- if err != nil {
- logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
- }
- instance.Status = constants.Running
- case "STOPPED":
- if instance.Status == constants.Stopped {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- instance.Status = constants.Stopped
- case "CREATED_FAILED":
- if instance.Status == constants.Failed {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
- if err != nil {
- logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
- }
- instance.Status = constants.Failed
- case "FAILED":
- if instance.Status == constants.Failed {
- if ch != nil {
- <-ch
- return
- }
- return
- }
- err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
- if err != nil {
- logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
- }
- instance.Status = constants.Failed
- default:
- instance.Status = ins.Status
- }
- }
-
- err = s.aiStorages.UpdateInferDeployInstance(instance, updatetime)
- if err != nil {
- if ch != nil {
- <-ch
- return
- }
- return
- }
-
- if ch != nil {
- <-ch
- return
- }
- }
-
- func (s *SyncInfer) UpdateAutoStoppedInstance() {
- list, err := s.aiStorages.GetInferDeployInstanceListLastMonth()
- if err != nil {
- return
- }
-
- if len(list) == 0 {
- return
- }
-
- s.UpdateDeployInstanceStatusBatch(list, false)
- }
-
- func (s *SyncInfer) CheckStopStatus(in *inference.DeployInstance) bool {
- switch in.ClusterType {
- case storeLink.TYPE_OCTOPUS:
- switch in.Status {
- case "stopped":
- return true
- default:
- return false
- }
- case storeLink.TYPE_MODELARTS:
- switch in.Status {
- case "stopped":
- return true
- default:
- return false
- }
- case storeLink.TYPE_SHUGUANGAI:
- switch in.Status {
- case "Terminated":
- return true
- default:
- return false
- }
- case storeLink.TYPE_OPENI:
- switch in.Status {
- case "STOPPED":
- return true
- default:
- return false
- }
- default:
- return false
- }
- }
-
- func (s *SyncInfer) CheckRunningStatus(in *inference.DeployInstance) bool {
- switch in.ClusterType {
- case storeLink.TYPE_OCTOPUS:
- switch in.Status {
- case "running":
- return true
- default:
- return false
- }
- case storeLink.TYPE_MODELARTS:
- switch in.Status {
- case "running":
- return true
- default:
- return false
- }
- case storeLink.TYPE_SHUGUANGAI:
- switch in.Status {
- case "Running":
- return true
- default:
- return false
- }
- case storeLink.TYPE_OPENI:
- switch in.Status {
- case "RUNNING":
- return true
- case "WAITING":
- return true
- default:
- return false
- }
- default:
- return false
- }
- }
-
- func (s *SyncInfer) ReportInferenceStatusMessages(ins *models.AiInferDeployInstance, taskName string, taskId string, clusterId string, url string, status bool, msg string) error {
- var id string
- var adapterID string
- var clusterID string
- var instanceID string
- if ins != nil {
- id = strconv.FormatInt(ins.Id, 10)
- adapterID = strconv.FormatInt(ins.AdapterId, 10)
- clusterID = strconv.FormatInt(ins.ClusterId, 10)
- instanceID = ins.InstanceId
- }
- report := &jcs.JobStatusReportReq{}
- reportMsg := &jcs.InferReportMessage{
- Type: "Inference",
- TaskName: taskName,
- TaskID: taskId,
- Status: status,
- Message: msg,
- Url: url,
- ID: id,
- AdapterID: adapterID,
- ClusterID: clusterID,
- InstanceID: instanceID,
- }
- report.Report = reportMsg
-
- err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report)
- if err != nil {
- return err
- }
- return nil
- }
|