package inference import ( "context" "encoding/json" "errors" "fmt" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" "strconv" "strings" "sync" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "github.com/zeromicro/go-zero/core/logx" ) type CreateInferenceTaskLogic struct { logx.Logger ctx context.Context svcCtx *svc.ServiceContext } func NewCreateInferenceTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CreateInferenceTaskLogic { return &CreateInferenceTaskLogic{ Logger: logx.WithContext(ctx), ctx: ctx, svcCtx: svcCtx, } } const AdapterId = "1777144940459986944" func (l *CreateInferenceTaskLogic) CreateInferenceTask(req *types.CreateInferenceTaskReq) (resp *types.CreateInferenceTaskResp, err error) { resp = &types.CreateInferenceTaskResp{} err = task.ValidateJobResources(req.JobResources, "inference") if err != nil { return nil, err } clusters, err := generateClustersForTaskCreation(req.DataDistributes, req.Name) if err != nil { return nil, err } modelName, err := generateModelName(clusters) if err != nil { return nil, err } taskName, err := l.svcCtx.Scheduler.AiService.HandleDuplicateTaskName(req.Name, "inference") if err != nil { return nil, err } assignedClusters := task.CopyParams(clusters, req.JobResources.Clusters, "inference") opt := &option.InferOption{ TaskName: taskName, TaskDesc: req.Description, ModelType: "", ModelName: modelName, Cmd: "", } taskId, err := l.svcCtx.Scheduler.AiStorages.SaveInferDeployTask(taskName, modelName, "", req.Description) if err != nil { return nil, err } adapterClusterMap := make(map[string][]*strategy.AssignedCluster) adapterClusterMap[AdapterId] = assignedClusters err = l.createInferenceTask(taskId, adapterClusterMap, opt) if err != nil { return nil, err } resp.TaskId = strconv.FormatInt(taskId, 10) resp.TaskName = taskName return } func (l *CreateInferenceTaskLogic) createInferenceTask(taskId int64, adapterClusterMap map[string][]*strategy.AssignedCluster, option *option.InferOption) error { var clusterlen int for _, c := range adapterClusterMap { clusterlen += len(c) } var errCh = make(chan interface{}, clusterlen) var errs []interface{} buf := make(chan bool, 2) var wg sync.WaitGroup for aid, v := range adapterClusterMap { for _, c := range v { wg.Add(1) cluster := c buf <- true go func() { opt, _ := cloneOption(option) updateInferOption(cluster, opt) err := l.createDeployInstance(taskId, aid, cluster.ClusterId, opt) if err != nil { e := struct { err error clusterId string }{ err: err, clusterId: cluster.ClusterId, } errCh <- e wg.Done() <-buf return } wg.Done() <-buf }() } } wg.Wait() close(errCh) for e := range errCh { errs = append(errs, e) } if len(errs) != 0 { var msg string for _, err := range errs { e := (err).(struct { err error clusterId string }) clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(e.clusterId) if err != nil { clusterName = e.clusterId } msg += fmt.Sprintf("CreateInstance Failed # clusterName: %v, error: %v \n", clusterName, e.err.Error()) } return errors.New(msg) } return nil } func updateInferOption(cluster *strategy.AssignedCluster, opt *option.InferOption) { opt.Cmd = cluster.Cmd opt.Envs = cluster.Envs opt.Params = cluster.Params opt.ImageId = cluster.ImageId opt.AlgorithmId = cluster.CodeId opt.ModelID = cluster.ModelId opt.ResourcesRequired = cluster.ResourcesRequired opt.Output = cluster.Output } func generateClustersForTaskCreation(distributes types.DataDistribute, taskName string) ([]*strategy.AssignedCluster, error) { var assignedClusters []*strategy.AssignedCluster clusterMap := make(map[string]*strategy.AssignedCluster) for _, distribute := range distributes.Model { if len(distribute.Clusters) == 0 { return nil, fmt.Errorf("Model distribute: must specify at least one cluster") } for _, c := range distribute.Clusters { if c.ClusterID == "" { return nil, fmt.Errorf("Model distribute: clusterId can not be empty") } cluster := &strategy.AssignedCluster{} cluster.ClusterId = c.ClusterID jsonData := entity.JsonData{} err := json.Unmarshal([]byte(c.JsonData), &jsonData) if err != nil { return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Model") } if jsonData.Id == "" { continue } cluster.ModelId = jsonData.Id cluster.ModelName = jsonData.Name clusterMap[c.ClusterID] = cluster } } for _, distribute := range distributes.Code { if len(distribute.Clusters) == 0 { return nil, fmt.Errorf("Code distribute: must specify at least one cluster") } for _, c := range distribute.Clusters { if c.ClusterID == "" { return nil, fmt.Errorf("Code distribute: clusterId can not be empty") } jsonData := entity.JsonData{} err := json.Unmarshal([]byte(c.JsonData), &jsonData) if err != nil { return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Code") } if jsonData.Id == "" { continue } cluster, ok := clusterMap[c.ClusterID] if ok { cluster.CodeId = jsonData.Id } } } for _, distribute := range distributes.Image { if len(distribute.Clusters) == 0 { return nil, fmt.Errorf("Image distribute: must specify at least one cluster") } for _, c := range distribute.Clusters { if c.ClusterID == "" { return nil, fmt.Errorf("Image distribute: clusterId can not be empty") } jsonData := entity.JsonData{} err := json.Unmarshal([]byte(c.JsonData), &jsonData) if err != nil { return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Image") } cluster, ok := clusterMap[c.ClusterID] if ok { cluster.ImageId = jsonData.Id } } } for _, c := range clusterMap { if c.ModelId == "" { return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ModelId") } if c.CodeId == "" { return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "CodeId") } if c.ImageId == "" { return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ImageId") } assignedClusters = append(assignedClusters, c) } if len(assignedClusters) == 0 { return nil, fmt.Errorf("no model provided") } return assignedClusters, nil } func generateModelName(clusters []*strategy.AssignedCluster) (string, error) { if len(clusters) == 1 { return clusters[0].ModelName, nil } var modelName string for _, c := range clusters { modelName += c.ModelName + "," } modelName = strings.TrimSuffix(modelName, ",") return modelName, nil } func (l *CreateInferenceTaskLogic) createDeployInstance(taskId int64, adapterId string, clusterId string, opt *option.InferOption) error { cmap, found := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[adapterId] if !found { return errors.New("adapterId not exist: " + adapterId) } iCluster, found := cmap[clusterId] if !found { return errors.New("clusterId not exist: " + clusterId) } insId, err := iCluster.CreateInferDeployInstance(l.ctx, opt) if err != nil { return err } aid, err := strconv.ParseInt(adapterId, 10, 64) if err != nil { return err } cid, err := strconv.ParseInt(clusterId, 10, 64) if err != nil { return err } adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(adapterId) if err != nil { return err } clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(clusterId) if err != nil { return err } ins, err := iCluster.GetInferDeployInstance(l.ctx, insId) if err != nil { return err } _, err = l.svcCtx.Scheduler.AiStorages.SaveInferDeployInstance(taskId, ins.InstanceId, ins.InstanceName, aid, adapterName, cid, clusterName, ins.ModelName, ins.ModelType, ins.InferCard, ins.ClusterType) if err != nil { return err } return nil } func cloneOption(opt *option.InferOption) (*option.InferOption, error) { origJSON, err := json.Marshal(opt) if err != nil { return nil, err } clone := option.InferOption{} if err = json.Unmarshal(origJSON, &clone); err != nil { return nil, err } return &clone, nil }