|
- package inference
-
- import (
- "context"
- "encoding/json"
- "errors"
- "fmt"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
- "strconv"
- "strings"
- "sync"
-
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
-
- "github.com/zeromicro/go-zero/core/logx"
- )
-
- type CreateInferenceTaskLogic struct {
- logx.Logger
- ctx context.Context
- svcCtx *svc.ServiceContext
- }
-
- func NewCreateInferenceTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CreateInferenceTaskLogic {
- return &CreateInferenceTaskLogic{
- Logger: logx.WithContext(ctx),
- ctx: ctx,
- svcCtx: svcCtx,
- }
- }
-
// AdapterId is the hard-coded adapter identifier under which
// CreateInferenceTask registers every assigned cluster; it is the only key
// placed in the adapter-to-clusters map handed to createInferenceTask.
// NOTE(review): presumably this should come from configuration — confirm.
const AdapterId = "1777144940459986944"
-
- func (l *CreateInferenceTaskLogic) CreateInferenceTask(req *types.CreateInferenceTaskReq) (resp *types.CreateInferenceTaskResp, err error) {
- resp = &types.CreateInferenceTaskResp{}
-
- err = task.ValidateJobResources(req.JobResources, "inference")
- if err != nil {
- return nil, err
- }
-
- clusters, err := generateClustersForTaskCreation(req.DataDistributes, req.Name)
- if err != nil {
- return nil, err
- }
-
- modelName, err := generateModelName(clusters)
- if err != nil {
- return nil, err
- }
-
- taskName, err := l.svcCtx.Scheduler.AiService.HandleDuplicateTaskName(req.Name, "inference")
- if err != nil {
- return nil, err
- }
-
- assignedClusters := task.CopyParams(clusters, req.JobResources.Clusters, "inference")
-
- opt := &option.InferOption{
- TaskName: taskName,
- TaskDesc: req.Description,
- ModelType: "",
- ModelName: modelName,
- Cmd: "",
- }
-
- taskId, err := l.svcCtx.Scheduler.AiStorages.SaveInferDeployTask(taskName, modelName, "", req.Description)
- if err != nil {
- return nil, err
- }
-
- adapterClusterMap := make(map[string][]*strategy.AssignedCluster)
- adapterClusterMap[AdapterId] = assignedClusters
-
- err = l.createInferenceTask(taskId, adapterClusterMap, opt)
- if err != nil {
- return nil, err
- }
-
- resp.TaskId = strconv.FormatInt(taskId, 10)
- resp.TaskName = taskName
- return
- }
-
- func (l *CreateInferenceTaskLogic) createInferenceTask(taskId int64, adapterClusterMap map[string][]*strategy.AssignedCluster, option *option.InferOption) error {
- var clusterlen int
- for _, c := range adapterClusterMap {
- clusterlen += len(c)
- }
- var errCh = make(chan interface{}, clusterlen)
- var errs []interface{}
-
- buf := make(chan bool, 2)
- var wg sync.WaitGroup
- for aid, v := range adapterClusterMap {
- for _, c := range v {
- wg.Add(1)
- cluster := c
- buf <- true
- go func() {
- opt, _ := cloneOption(option)
-
- updateInferOption(cluster, opt)
-
- err := l.createDeployInstance(taskId, aid, cluster.ClusterId, opt)
- if err != nil {
- e := struct {
- err error
- clusterId string
- }{
- err: err,
- clusterId: cluster.ClusterId,
- }
- errCh <- e
- wg.Done()
- <-buf
- return
- }
- wg.Done()
- <-buf
- }()
- }
- }
- wg.Wait()
- close(errCh)
-
- for e := range errCh {
- errs = append(errs, e)
- }
-
- if len(errs) != 0 {
- var msg string
- for _, err := range errs {
- e := (err).(struct {
- err error
- clusterId string
- })
-
- clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(e.clusterId)
- if err != nil {
- clusterName = e.clusterId
- }
-
- msg += fmt.Sprintf("CreateInstance Failed # clusterName: %v, error: %v \n", clusterName, e.err.Error())
-
- }
- return errors.New(msg)
- }
-
- return nil
- }
-
- func updateInferOption(cluster *strategy.AssignedCluster, opt *option.InferOption) {
- opt.Cmd = cluster.Cmd
- opt.Envs = cluster.Envs
- opt.Params = cluster.Params
-
- opt.ImageId = cluster.ImageId
- opt.AlgorithmId = cluster.CodeId
- opt.ModelID = cluster.ModelId
-
- opt.ResourcesRequired = cluster.ResourcesRequired
-
- opt.Output = cluster.Output
- }
-
- func generateClustersForTaskCreation(distributes types.DataDistribute, taskName string) ([]*strategy.AssignedCluster, error) {
- var assignedClusters []*strategy.AssignedCluster
- clusterMap := make(map[string]*strategy.AssignedCluster)
-
- for _, distribute := range distributes.Model {
- if len(distribute.Clusters) == 0 {
- return nil, fmt.Errorf("Model distribute: must specify at least one cluster")
- }
-
- for _, c := range distribute.Clusters {
- if c.ClusterID == "" {
- return nil, fmt.Errorf("Model distribute: clusterId can not be empty")
- }
- cluster := &strategy.AssignedCluster{}
- cluster.ClusterId = c.ClusterID
-
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Model")
- }
- if jsonData.Id == "" {
- continue
- }
- cluster.ModelId = jsonData.Id
- cluster.ModelName = jsonData.Name
- clusterMap[c.ClusterID] = cluster
- }
-
- }
-
- for _, distribute := range distributes.Code {
- if len(distribute.Clusters) == 0 {
- return nil, fmt.Errorf("Code distribute: must specify at least one cluster")
- }
- for _, c := range distribute.Clusters {
- if c.ClusterID == "" {
- return nil, fmt.Errorf("Code distribute: clusterId can not be empty")
- }
-
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Code")
- }
- if jsonData.Id == "" {
- continue
- }
-
- cluster, ok := clusterMap[c.ClusterID]
- if ok {
- cluster.CodeId = jsonData.Id
- }
- }
- }
-
- for _, distribute := range distributes.Image {
- if len(distribute.Clusters) == 0 {
- return nil, fmt.Errorf("Image distribute: must specify at least one cluster")
- }
- for _, c := range distribute.Clusters {
- if c.ClusterID == "" {
- return nil, fmt.Errorf("Image distribute: clusterId can not be empty")
- }
-
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Image")
- }
-
- cluster, ok := clusterMap[c.ClusterID]
- if ok {
- cluster.ImageId = jsonData.Id
- }
- }
- }
-
- for _, c := range clusterMap {
- if c.ModelId == "" {
- return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ModelId")
- }
-
- if c.CodeId == "" {
- return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "CodeId")
- }
-
- if c.ImageId == "" {
- return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ImageId")
- }
-
- assignedClusters = append(assignedClusters, c)
- }
-
- if len(assignedClusters) == 0 {
- return nil, fmt.Errorf("no model provided")
- }
-
- return assignedClusters, nil
- }
-
- func generateModelName(clusters []*strategy.AssignedCluster) (string, error) {
- if len(clusters) == 1 {
- return clusters[0].ModelName, nil
- }
-
- var modelName string
- for _, c := range clusters {
- modelName += c.ModelName + ","
- }
-
- modelName = strings.TrimSuffix(modelName, ",")
- return modelName, nil
- }
-
- func (l *CreateInferenceTaskLogic) createDeployInstance(taskId int64, adapterId string, clusterId string, opt *option.InferOption) error {
- cmap, found := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[adapterId]
- if !found {
- return errors.New("adapterId not exist: " + adapterId)
- }
- iCluster, found := cmap[clusterId]
- if !found {
- return errors.New("clusterId not exist: " + clusterId)
- }
- insId, err := iCluster.CreateInferDeployInstance(l.ctx, opt)
- if err != nil {
- return err
- }
-
- aid, err := strconv.ParseInt(adapterId, 10, 64)
- if err != nil {
- return err
- }
- cid, err := strconv.ParseInt(clusterId, 10, 64)
- if err != nil {
- return err
- }
-
- adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(adapterId)
- if err != nil {
- return err
- }
-
- clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(clusterId)
- if err != nil {
- return err
- }
-
- ins, err := iCluster.GetInferDeployInstance(l.ctx, insId)
- if err != nil {
- return err
- }
-
- _, err = l.svcCtx.Scheduler.AiStorages.SaveInferDeployInstance(taskId, ins.InstanceId, ins.InstanceName, aid, adapterName, cid, clusterName, ins.ModelName, ins.ModelType, ins.InferCard, ins.ClusterType)
- if err != nil {
- return err
- }
-
- return nil
- }
-
- func cloneOption(opt *option.InferOption) (*option.InferOption, error) {
- origJSON, err := json.Marshal(opt)
- if err != nil {
- return nil, err
- }
-
- clone := option.InferOption{}
- if err = json.Unmarshal(origJSON, &clone); err != nil {
- return nil, err
- }
-
- return &clone, nil
- }
|