package inference

import (
	"context"
	"errors"
	"fmt"
	"strconv"
	"sync"

	"github.com/zeromicro/go-zero/core/logx"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
	"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
)

// CreateDeployTaskLogic bundles the request context, the service context and a
// context-aware logger used while handling a "create deploy task" request.
type CreateDeployTaskLogic struct {
	logx.Logger
	ctx    context.Context
	svcCtx *svc.ServiceContext
}

// NewCreateDeployTaskLogic returns a CreateDeployTaskLogic bound to ctx and svcCtx.
func NewCreateDeployTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CreateDeployTaskLogic {
	return &CreateDeployTaskLogic{
		Logger: logx.WithContext(ctx),
		ctx:    ctx,
		svcCtx: svcCtx,
	}
}

// CreateDeployTask stores a new inference deploy task and then creates one
// deploy instance for every (adapter, cluster) pair in req.AdapterClusterMap.
//
// It returns an error when the adapter map is empty, when the task name is
// already in use, when the task cannot be saved, or when at least one instance
// could not be created. In the last case the error message contains one line
// per failed cluster. On success an empty response is returned.
func (l *CreateDeployTaskLogic) CreateDeployTask(req *types.CreateDeployTaskReq) (resp *types.CreateDeployTaskResp, err error) {
	// deployFailure records which cluster failed and why.
	type deployFailure struct {
		clusterID string
		err       error
	}
	// maxConcurrent bounds how many instance creations run at the same time.
	const maxConcurrent = 2

	if len(req.AdapterClusterMap) == 0 {
		return nil, errors.New("adapters are empty")
	}

	opt := &option.InferOption{
		TaskName:  req.TaskName,
		ModelType: req.ModelType,
		ModelName: req.ModelName,
		Cmd:       "",
	}

	duplicated, err := l.svcCtx.Scheduler.AiStorages.IsDeployTaskNameDuplicated(req.TaskName)
	if err != nil {
		return nil, err
	}
	if duplicated {
		return nil, errors.New("TaskName already exists")
	}

	taskId, err := l.svcCtx.Scheduler.AiStorages.SaveInferDeployTask(req.TaskName, 0, req.ModelName, req.ModelType, req.TaskDesc)
	if err != nil {
		return nil, err
	}

	total := 0
	for _, clusters := range req.AdapterClusterMap {
		total += len(clusters)
	}

	// The channel is sized for the worst case (every instance fails) so that
	// workers never block while reporting a failure.
	failures := make(chan deployFailure, total)
	sem := make(chan struct{}, maxConcurrent)
	var wg sync.WaitGroup

	for adapterID, clusters := range req.AdapterClusterMap {
		for _, clusterID := range clusters {
			wg.Add(1)
			sem <- struct{}{} // wait for a free slot before starting another worker
			// Both ids are passed as arguments so every worker owns its own
			// copies regardless of the loop-variable semantics in effect.
			go func(adapterID, clusterID string) {
				defer wg.Done()
				defer func() { <-sem }()
				// The error stays local to the worker: writing to the
				// function's named result from here would be a data race.
				if err := l.createDeployInstance(taskId, adapterID, clusterID, opt); err != nil {
					failures <- deployFailure{clusterID: clusterID, err: err}
				}
			}(adapterID, clusterID)
		}
	}
	wg.Wait()
	close(failures)

	if len(failures) == 0 {
		return &types.CreateDeployTaskResp{}, nil
	}

	var msg string
	for f := range failures {
		clusterName, nameErr := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(f.clusterID)
		if nameErr != nil {
			// Fall back to the raw id when the name cannot be resolved.
			clusterName = f.clusterID
		}
		msg += fmt.Sprintf("CreateInstance Failed # clusterName: %v, error: %v \n", clusterName, f.err.Error())
	}
	return nil, errors.New(msg)
}

// createDeployInstance creates one inference deploy instance on the cluster
// identified by adapterId/clusterId and stores the resulting instance under
// taskId.
//
// It returns an error when the adapter or cluster is not registered in the
// inference adapter map, when either id is not a base-10 integer, or when any
// of the create, lookup or save calls fails.
func (l *CreateDeployTaskLogic) createDeployInstance(taskId int64, adapterId string, clusterId string, opt *option.InferOption) error {
	cmap, found := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[adapterId]
	if !found {
		return errors.New("adapterId not exist: " + adapterId)
	}
	iCluster, found := cmap[clusterId]
	if !found {
		return errors.New("clusterId not exist: " + clusterId)
	}

	// Validate the ids before touching the cluster so that a malformed id
	// cannot leave behind an instance that was created but never recorded.
	aid, err := strconv.ParseInt(adapterId, 10, 64)
	if err != nil {
		return err
	}
	cid, err := strconv.ParseInt(clusterId, 10, 64)
	if err != nil {
		return err
	}

	insId, err := iCluster.CreateInferDeployInstance(l.ctx, opt)
	if err != nil {
		return err
	}

	adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(adapterId)
	if err != nil {
		return err
	}
	clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(clusterId)
	if err != nil {
		return err
	}

	ins, err := iCluster.GetInferDeployInstance(l.ctx, insId)
	if err != nil {
		return err
	}

	_, err = l.svcCtx.Scheduler.AiStorages.SaveInferDeployInstance(taskId, ins.InstanceId, ins.InstanceName, aid, adapterName, cid, clusterName, ins.ModelName, ins.ModelType, ins.InferCard, ins.ClusterType)
	return err
}