| @@ -70,7 +70,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er | |||
| utils.Convert(&req, &opt) | |||
| sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient) | |||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.JOINT_CLOUD_MODE, nil) | |||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||
| if err != nil { | |||
| logx.Errorf("AssignAndSchedule() => execution error: %v", err) | |||
| return err | |||
| @@ -63,7 +63,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type | |||
| return nil, err | |||
| } | |||
| // 3、Return scheduling results | |||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.JOINT_CLOUD_MODE, nil) | |||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||
| if err != nil { | |||
| logx.Errorf("AssignAndSchedule() => execution error: %v", err) | |||
| return nil, err | |||
| @@ -24,7 +24,11 @@ func NewScheduleCancelTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) | |||
| } | |||
| func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) { | |||
| // todo: add your logic here and delete this line | |||
| // find task | |||
| _, err = l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskId) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return | |||
| } | |||
| @@ -6,12 +6,16 @@ import ( | |||
| "errors" | |||
| "fmt" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | |||
| "gopkg.in/yaml.v2" | |||
| "strings" | |||
| ) | |||
| type ScheduleRunTaskLogic struct { | |||
| @@ -49,8 +53,9 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ | |||
| return nil, err | |||
| } | |||
| _ = &option.AiOption{ | |||
| opt := &option.AiOption{ | |||
| AdapterId: ADAPTERID, | |||
| TaskName: task.Name, | |||
| } | |||
| // update assignedClusters | |||
| err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas) | |||
| @@ -58,35 +63,52 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ | |||
| return nil, err | |||
| } | |||
| //aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt) | |||
| //if err != nil { | |||
| // return nil, err | |||
| //} | |||
| // | |||
| //results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.STORAGE_SCHEDULE_MODE, clusters) | |||
| //if err != nil { | |||
| // return nil, err | |||
| //} | |||
| //adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(ADAPTERID) | |||
| //if err != nil { | |||
| // return nil, err | |||
| //} | |||
| // | |||
| //for _, i := range clusters { | |||
| // clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(i.ClusterID) | |||
| // | |||
| // opt := &option.AiOption{} | |||
| // | |||
| // err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, i.ClusterID, clusterName, "", constants.Saved, "") | |||
| // if err != nil { | |||
| // return nil, errors.New("database add failed: " + err.Error()) | |||
| // } | |||
| //} | |||
| aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_STORAGE_SCHEDULE, clusters) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| rs := (results).([]*schedulers.AiResult) | |||
| err = l.SaveResult(task, rs, opt) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return | |||
| } | |||
| func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error { | |||
| for _, r := range results { | |||
| opt.ComputeCard = strings.ToUpper(r.Card) | |||
| adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(r.AdapterId) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId) | |||
| err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(r.AdapterId, adapterName, r.ClusterId, clusterName, r.TaskName, "create", "任务创建中") | |||
| } | |||
| return nil | |||
| } | |||
| func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error { | |||
| for _, cluster := range *assignedClusters { | |||
| for _, data := range scheduledDatas { | |||
| @@ -52,7 +52,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type | |||
| return nil, err | |||
| } | |||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil) | |||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| @@ -41,7 +41,7 @@ func (l *AiQueue) Consume(val string) error { | |||
| aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil) | |||
| // 调度算法 | |||
| _, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil) | |||
| _, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| @@ -30,8 +30,8 @@ import ( | |||
| ) | |||
| const ( | |||
| JOINT_CLOUD_MODE = iota + 1 | |||
| STORAGE_SCHEDULE_MODE | |||
| SUBMIT_MODE_JOINT_CLOUD = iota + 1 | |||
| SUBMIT_MODE_STORAGE_SCHEDULE | |||
| ) | |||
| type Scheduler struct { | |||
| @@ -134,7 +134,7 @@ func (s *Scheduler) TempAssign() error { | |||
| func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) { | |||
| var result interface{} | |||
| switch mode { | |||
| case JOINT_CLOUD_MODE: | |||
| case SUBMIT_MODE_JOINT_CLOUD: | |||
| //choose strategy | |||
| strategy, err := ss.PickOptimalStrategy() | |||
| if err != nil { | |||
| @@ -155,7 +155,7 @@ func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters | |||
| result = resp | |||
| case STORAGE_SCHEDULE_MODE: | |||
| case SUBMIT_MODE_STORAGE_SCHEDULE: | |||
| //assign tasks to clusters | |||
| resp, err := ss.AssignTask(assignedClusters, mode) | |||
| @@ -175,7 +175,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int | |||
| opt, _ := cloneAiOption(as.option) | |||
| // decide opt params by mode | |||
| updateAiOptionByMode(c, opt, scheduler.STORAGE_SCHEDULE_MODE) | |||
| updateAiOptionByMode(c, opt, mode) | |||
| resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode) | |||
| if err != nil { | |||
| @@ -282,7 +282,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int | |||
| func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) { | |||
| switch mode { | |||
| case scheduler.STORAGE_SCHEDULE_MODE: | |||
| case scheduler.SUBMIT_MODE_STORAGE_SCHEDULE: | |||
| opt.Cmd = cluster.Cmd | |||
| opt.Envs = cluster.Envs | |||
| opt.Params = cluster.Params | |||
| @@ -290,6 +290,8 @@ func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOptio | |||
| opt.ImageId = cluster.ImageId | |||
| opt.AlgorithmId = cluster.CodeId | |||
| opt.DatasetsId = cluster.DatasetId | |||
| opt.ResourcesRequired = cluster.ResourcesRequired | |||
| default: | |||
| } | |||
| @@ -32,6 +32,8 @@ type AiOption struct { | |||
| AlgorithmCode string | |||
| Image string | |||
| Model interface{} | |||
| ResourcesRequired []map[string]interface{} | |||
| } | |||
| func (a AiOption) GetOptionType() string { | |||
| @@ -179,6 +179,7 @@ func (s *ShuguangAi) SubmitPytorchTask(ctx context.Context, imageId string, cmd | |||
| workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2] | |||
| codePath = workPath + FORWARD_SLASH + TRAIN_FILE | |||
| } else { | |||
| // storage schedule submit mode | |||
| codePath = algorithmId | |||
| paths = strings.Split(algorithmId, FORWARD_SLASH) | |||
| last := paths[len(paths)-1] | |||
| @@ -602,10 +603,56 @@ func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*colle | |||
| } | |||
| func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) { | |||
| err := s.GenerateSubmitParams(ctx, option) | |||
| if err != nil { | |||
| return nil, err | |||
| switch mode { | |||
| case 1: | |||
| err := s.GenerateSubmitParams(ctx, option) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| case 2: | |||
| var dcuNum int64 | |||
| for _, res := range option.ResourcesRequired { | |||
| typeName, ok := res["type"] | |||
| if !ok { | |||
| continue | |||
| } | |||
| switch typeName { | |||
| case DCU: | |||
| num, ok := res["number"] | |||
| if !ok { | |||
| continue | |||
| } | |||
| n := common.ConvertTypeToString(num) | |||
| val, err := strconv.ParseInt(n, 10, 64) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| dcuNum = val | |||
| } | |||
| } | |||
| for k, v := range RESOURCESGAIMAP { | |||
| if dcuNum == v.GPU { | |||
| option.ResourceId = k | |||
| break | |||
| } | |||
| if dcuNum == 0 && v.GPU == 1 { | |||
| option.ResourceId = k | |||
| break | |||
| } | |||
| if dcuNum >= 5 && v.GPU == 5 { | |||
| option.ResourceId = k | |||
| break | |||
| } | |||
| } | |||
| option.ComputeCard = DCU | |||
| default: | |||
| return nil, errors.New("failed to choose submit mode") | |||
| } | |||
| task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType) | |||
| if err != nil { | |||
| return nil, err | |||