|
|
|
@@ -0,0 +1,390 @@ |
|
|
|
package tasksync |
|
|
|
|
|
|
|
import ( |
|
|
|
"github.com/zeromicro/go-zero/core/logx" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/config" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" |
|
|
|
"net/http" |
|
|
|
"strconv" |
|
|
|
"sync" |
|
|
|
"time" |
|
|
|
) |
|
|
|
|
|
|
|
type SyncInfer struct { |
|
|
|
mu sync.Mutex |
|
|
|
aiStorages *database.AiStorage |
|
|
|
inferenceAdapterMap map[string]map[string]inference.ICluster |
|
|
|
config *config.Config |
|
|
|
} |
|
|
|
|
|
|
|
func NewInferTask(storage *database.AiStorage, inferenceAdapterMap map[string]map[string]inference.ICluster, config *config.Config) *SyncInfer { |
|
|
|
return &SyncInfer{ |
|
|
|
aiStorages: storage, |
|
|
|
inferenceAdapterMap: inferenceAdapterMap, |
|
|
|
config: config, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (s *SyncInfer) UpdateDeployInstanceStatusBatch(insList []*models.AiInferDeployInstance, needfilter bool) { |
|
|
|
s.mu.Lock() |
|
|
|
defer s.mu.Unlock() |
|
|
|
list := make([]*models.AiInferDeployInstance, len(insList)) |
|
|
|
copy(list, insList) |
|
|
|
|
|
|
|
if needfilter { |
|
|
|
for i := len(list) - 1; i >= 0; i-- { |
|
|
|
if list[i].Status == constants.Running || list[i].Status == constants.Stopped || list[i].Status == constants.Failed { |
|
|
|
list = append(list[:i], list[i+1:]...) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if len(list) == 0 { |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
buffer := make(chan bool, 3) |
|
|
|
for _, instance := range list { |
|
|
|
buffer <- true |
|
|
|
go s.UpdateDeployInstanceStatus(instance, false, buffer) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (s *SyncInfer) UpdateDeployTaskStatus() { |
|
|
|
list, err := s.aiStorages.GetAllDeployTasks() |
|
|
|
if err != nil { |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
ins := list[0] |
|
|
|
for i := range list { |
|
|
|
uTime, _ := time.Parse(time.RFC3339, ins.UpdateTime) |
|
|
|
latest, _ := time.Parse(time.RFC3339, list[i].UpdateTime) |
|
|
|
if latest.After(uTime) { |
|
|
|
ins = list[i] |
|
|
|
} |
|
|
|
} |
|
|
|
inslist, err := s.aiStorages.GetInstanceListByDeployTaskId(ins.Id) |
|
|
|
if err != nil { |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
buffer := make(chan bool, 2) |
|
|
|
for _, instance := range inslist { |
|
|
|
buffer <- true |
|
|
|
go s.UpdateDeployInstanceStatus(instance, false, buffer) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (s *SyncInfer) UpdateDeployInstanceStatus(instance *models.AiInferDeployInstance, updatetime bool, ch chan bool) { |
|
|
|
amap, found := s.inferenceAdapterMap[strconv.FormatInt(instance.AdapterId, 10)] |
|
|
|
if !found { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
cmap, found := amap[strconv.FormatInt(instance.ClusterId, 10)] |
|
|
|
if !found { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
h := http.Request{} |
|
|
|
ins, err := cmap.GetInferDeployInstance(h.Context(), instance.InstanceId) |
|
|
|
if err != nil { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
switch instance.ClusterType { |
|
|
|
case storeLink.TYPE_OCTOPUS: |
|
|
|
switch ins.Status { |
|
|
|
case "running": |
|
|
|
if instance.Status == constants.Running { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
url := ins.InferUrl |
|
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "") |
|
|
|
if err != nil { |
|
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) |
|
|
|
} |
|
|
|
instance.Status = constants.Running |
|
|
|
case "stopped": |
|
|
|
if instance.Status == constants.Stopped { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
instance.Status = constants.Stopped |
|
|
|
default: |
|
|
|
instance.Status = ins.Status |
|
|
|
} |
|
|
|
case storeLink.TYPE_MODELARTS: |
|
|
|
switch ins.Status { |
|
|
|
case "running": |
|
|
|
if instance.Status == constants.Running { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
url := ins.InferUrl |
|
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "") |
|
|
|
if err != nil { |
|
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) |
|
|
|
} |
|
|
|
instance.Status = constants.Running |
|
|
|
case "stopped": |
|
|
|
if instance.Status == constants.Stopped { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
instance.Status = constants.Stopped |
|
|
|
case "failed": |
|
|
|
if instance.Status == constants.Failed { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status) |
|
|
|
if err != nil { |
|
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) |
|
|
|
} |
|
|
|
instance.Status = constants.Failed |
|
|
|
default: |
|
|
|
instance.Status = ins.Status |
|
|
|
} |
|
|
|
case storeLink.TYPE_SHUGUANGAI: |
|
|
|
switch ins.Status { |
|
|
|
case "Running": |
|
|
|
if instance.Status == constants.Running { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
instance.Status = constants.Running |
|
|
|
case "Terminated": |
|
|
|
if instance.Status == constants.Stopped { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
instance.Status = constants.Stopped |
|
|
|
default: |
|
|
|
instance.Status = ins.Status |
|
|
|
} |
|
|
|
case storeLink.TYPE_OPENI: |
|
|
|
switch ins.Status { |
|
|
|
case "RUNNING": |
|
|
|
if instance.Status == constants.Running { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
url := ins.InferUrl |
|
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "") |
|
|
|
if err != nil { |
|
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) |
|
|
|
} |
|
|
|
instance.Status = constants.Running |
|
|
|
case "STOPPED": |
|
|
|
if instance.Status == constants.Stopped { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
instance.Status = constants.Stopped |
|
|
|
case "CREATED_FAILED": |
|
|
|
if instance.Status == constants.Failed { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status) |
|
|
|
if err != nil { |
|
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) |
|
|
|
} |
|
|
|
instance.Status = constants.Failed |
|
|
|
case "FAILED": |
|
|
|
if instance.Status == constants.Failed { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status) |
|
|
|
if err != nil { |
|
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error()) |
|
|
|
} |
|
|
|
instance.Status = constants.Failed |
|
|
|
default: |
|
|
|
instance.Status = ins.Status |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
err = s.aiStorages.UpdateInferDeployInstance(instance, updatetime) |
|
|
|
if err != nil { |
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if ch != nil { |
|
|
|
<-ch |
|
|
|
return |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (s *SyncInfer) UpdateAutoStoppedInstance() { |
|
|
|
list, err := s.aiStorages.GetInferDeployInstanceList() |
|
|
|
if err != nil { |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if len(list) == 0 { |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
s.UpdateDeployInstanceStatusBatch(list, false) |
|
|
|
} |
|
|
|
|
|
|
|
func (s *SyncInfer) CheckStopStatus(in *inference.DeployInstance) bool { |
|
|
|
switch in.ClusterType { |
|
|
|
case storeLink.TYPE_OCTOPUS: |
|
|
|
switch in.Status { |
|
|
|
case "stopped": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
case storeLink.TYPE_MODELARTS: |
|
|
|
switch in.Status { |
|
|
|
case "stopped": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
case storeLink.TYPE_SHUGUANGAI: |
|
|
|
switch in.Status { |
|
|
|
case "Terminated": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
case storeLink.TYPE_OPENI: |
|
|
|
switch in.Status { |
|
|
|
case "STOPPED": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (s *SyncInfer) CheckRunningStatus(in *inference.DeployInstance) bool { |
|
|
|
switch in.ClusterType { |
|
|
|
case storeLink.TYPE_OCTOPUS: |
|
|
|
switch in.Status { |
|
|
|
case "running": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
case storeLink.TYPE_MODELARTS: |
|
|
|
switch in.Status { |
|
|
|
case "running": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
case storeLink.TYPE_SHUGUANGAI: |
|
|
|
switch in.Status { |
|
|
|
case "Running": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
case storeLink.TYPE_OPENI: |
|
|
|
switch in.Status { |
|
|
|
case "RUNNING": |
|
|
|
return true |
|
|
|
case "WAITING": |
|
|
|
return true |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
default: |
|
|
|
return false |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (s *SyncInfer) ReportInferenceStatusMessages(ins *models.AiInferDeployInstance, taskName string, taskId string, clusterId string, url string, status bool, msg string) error { |
|
|
|
var id string |
|
|
|
var adapterID string |
|
|
|
var clusterID string |
|
|
|
var instanceID string |
|
|
|
if ins != nil { |
|
|
|
id = strconv.FormatInt(ins.Id, 10) |
|
|
|
adapterID = strconv.FormatInt(ins.AdapterId, 10) |
|
|
|
clusterID = strconv.FormatInt(ins.ClusterId, 10) |
|
|
|
instanceID = ins.InstanceId |
|
|
|
} |
|
|
|
report := &jcs.JobStatusReportReq{} |
|
|
|
reportMsg := &jcs.InferReportMessage{ |
|
|
|
Type: "Inference", |
|
|
|
TaskName: taskName, |
|
|
|
TaskID: taskId, |
|
|
|
Status: status, |
|
|
|
Message: msg, |
|
|
|
Url: url, |
|
|
|
ID: id, |
|
|
|
AdapterID: adapterID, |
|
|
|
ClusterID: clusterID, |
|
|
|
InstanceID: instanceID, |
|
|
|
} |
|
|
|
report.Report = reportMsg |
|
|
|
|
|
|
|
err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report) |
|
|
|
if err != nil { |
|
|
|
return err |
|
|
|
} |
|
|
|
return nil |
|
|
|
} |