| @@ -70,7 +70,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er | |||||
| utils.Convert(&req, &opt) | utils.Convert(&req, &opt) | ||||
| sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient) | sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient) | ||||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.JOINT_CLOUD_MODE, nil) | |||||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||||
| if err != nil { | if err != nil { | ||||
| logx.Errorf("AssignAndSchedule() => execution error: %v", err) | logx.Errorf("AssignAndSchedule() => execution error: %v", err) | ||||
| return err | return err | ||||
| @@ -63,7 +63,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| // 3、Return scheduling results | // 3、Return scheduling results | ||||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.JOINT_CLOUD_MODE, nil) | |||||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||||
| if err != nil { | if err != nil { | ||||
| logx.Errorf("AssignAndSchedule() => execution error: %v", err) | logx.Errorf("AssignAndSchedule() => execution error: %v", err) | ||||
| return nil, err | return nil, err | ||||
| @@ -24,7 +24,11 @@ func NewScheduleCancelTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) | |||||
| } | } | ||||
| func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) { | func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) { | ||||
| // todo: add your logic here and delete this line | |||||
| // find task | |||||
| _, err = l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskId) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| return | return | ||||
| } | } | ||||
| @@ -6,12 +6,16 @@ import ( | |||||
| "errors" | "errors" | ||||
| "fmt" | "fmt" | ||||
| "github.com/zeromicro/go-zero/core/logx" | "github.com/zeromicro/go-zero/core/logx" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | |||||
| "gopkg.in/yaml.v2" | "gopkg.in/yaml.v2" | ||||
| "strings" | |||||
| ) | ) | ||||
| type ScheduleRunTaskLogic struct { | type ScheduleRunTaskLogic struct { | ||||
| @@ -49,8 +53,9 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| _ = &option.AiOption{ | |||||
| opt := &option.AiOption{ | |||||
| AdapterId: ADAPTERID, | AdapterId: ADAPTERID, | ||||
| TaskName: task.Name, | |||||
| } | } | ||||
| // update assignedClusters | // update assignedClusters | ||||
| err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas) | err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas) | ||||
| @@ -58,35 +63,52 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| //aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt) | |||||
| //if err != nil { | |||||
| // return nil, err | |||||
| //} | |||||
| // | |||||
| //results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.STORAGE_SCHEDULE_MODE, clusters) | |||||
| //if err != nil { | |||||
| // return nil, err | |||||
| //} | |||||
| //adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(ADAPTERID) | |||||
| //if err != nil { | |||||
| // return nil, err | |||||
| //} | |||||
| // | |||||
| //for _, i := range clusters { | |||||
| // clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(i.ClusterID) | |||||
| // | |||||
| // opt := &option.AiOption{} | |||||
| // | |||||
| // err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, i.ClusterID, clusterName, "", constants.Saved, "") | |||||
| // if err != nil { | |||||
| // return nil, errors.New("database add failed: " + err.Error()) | |||||
| // } | |||||
| //} | |||||
| aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_STORAGE_SCHEDULE, clusters) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| rs := (results).([]*schedulers.AiResult) | |||||
| err = l.SaveResult(task, rs, opt) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| return | return | ||||
| } | } | ||||
| func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error { | |||||
| for _, r := range results { | |||||
| opt.ComputeCard = strings.ToUpper(r.Card) | |||||
| adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(r.AdapterId) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId) | |||||
| err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(r.AdapterId, adapterName, r.ClusterId, clusterName, r.TaskName, "create", "任务创建中") | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error { | func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error { | ||||
| for _, cluster := range *assignedClusters { | for _, cluster := range *assignedClusters { | ||||
| for _, data := range scheduledDatas { | for _, data := range scheduledDatas { | ||||
| @@ -52,7 +52,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil) | |||||
| results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||||
| if err != nil { | if err != nil { | ||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| @@ -41,7 +41,7 @@ func (l *AiQueue) Consume(val string) error { | |||||
| aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil) | aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil) | ||||
| // run the scheduling algorithm | // run the scheduling algorithm | ||||
| _, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil) | |||||
| _, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil) | |||||
| if err != nil { | if err != nil { | ||||
| return err | return err | ||||
| } | } | ||||
| @@ -30,8 +30,8 @@ import ( | |||||
| ) | ) | ||||
// Submit modes accepted by Scheduler.AssignAndSchedule. Numbering starts at 1.
// ShuguangAi.Execute switches on the raw values 1 and 2, so the numeric
// values of these constants must stay as they are.
const (
	// SUBMIT_MODE_JOINT_CLOUD selects the branch of AssignAndSchedule that
	// first picks a strategy through PickOptimalStrategy.
	SUBMIT_MODE_JOINT_CLOUD = 1
	// SUBMIT_MODE_STORAGE_SCHEDULE selects the branch of AssignAndSchedule
	// that assigns tasks straight to the clusters supplied by the caller.
	SUBMIT_MODE_STORAGE_SCHEDULE = 2
)
| type Scheduler struct { | type Scheduler struct { | ||||
| @@ -134,7 +134,7 @@ func (s *Scheduler) TempAssign() error { | |||||
| func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) { | func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) { | ||||
| var result interface{} | var result interface{} | ||||
| switch mode { | switch mode { | ||||
| case JOINT_CLOUD_MODE: | |||||
| case SUBMIT_MODE_JOINT_CLOUD: | |||||
| //choose strategy | //choose strategy | ||||
| strategy, err := ss.PickOptimalStrategy() | strategy, err := ss.PickOptimalStrategy() | ||||
| if err != nil { | if err != nil { | ||||
| @@ -155,7 +155,7 @@ func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters | |||||
| result = resp | result = resp | ||||
| case STORAGE_SCHEDULE_MODE: | |||||
| case SUBMIT_MODE_STORAGE_SCHEDULE: | |||||
| //assign tasks to clusters | //assign tasks to clusters | ||||
| resp, err := ss.AssignTask(assignedClusters, mode) | resp, err := ss.AssignTask(assignedClusters, mode) | ||||
| @@ -175,7 +175,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int | |||||
| opt, _ := cloneAiOption(as.option) | opt, _ := cloneAiOption(as.option) | ||||
| // decide opt params by mode | // decide opt params by mode | ||||
| updateAiOptionByMode(c, opt, scheduler.STORAGE_SCHEDULE_MODE) | |||||
| updateAiOptionByMode(c, opt, mode) | |||||
| resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode) | resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -282,7 +282,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int | |||||
| func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) { | func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) { | ||||
| switch mode { | switch mode { | ||||
| case scheduler.STORAGE_SCHEDULE_MODE: | |||||
| case scheduler.SUBMIT_MODE_STORAGE_SCHEDULE: | |||||
| opt.Cmd = cluster.Cmd | opt.Cmd = cluster.Cmd | ||||
| opt.Envs = cluster.Envs | opt.Envs = cluster.Envs | ||||
| opt.Params = cluster.Params | opt.Params = cluster.Params | ||||
| @@ -290,6 +290,8 @@ func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOptio | |||||
| opt.ImageId = cluster.ImageId | opt.ImageId = cluster.ImageId | ||||
| opt.AlgorithmId = cluster.CodeId | opt.AlgorithmId = cluster.CodeId | ||||
| opt.DatasetsId = cluster.DatasetId | opt.DatasetsId = cluster.DatasetId | ||||
| opt.ResourcesRequired = cluster.ResourcesRequired | |||||
| default: | default: | ||||
| } | } | ||||
| @@ -32,6 +32,8 @@ type AiOption struct { | |||||
| AlgorithmCode string | AlgorithmCode string | ||||
| Image string | Image string | ||||
| Model interface{} | Model interface{} | ||||
| ResourcesRequired []map[string]interface{} | |||||
| } | } | ||||
| func (a AiOption) GetOptionType() string { | func (a AiOption) GetOptionType() string { | ||||
| @@ -179,6 +179,7 @@ func (s *ShuguangAi) SubmitPytorchTask(ctx context.Context, imageId string, cmd | |||||
| workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2] | workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2] | ||||
| codePath = workPath + FORWARD_SLASH + TRAIN_FILE | codePath = workPath + FORWARD_SLASH + TRAIN_FILE | ||||
| } else { | } else { | ||||
| // storage schedule submit mode | |||||
| codePath = algorithmId | codePath = algorithmId | ||||
| paths = strings.Split(algorithmId, FORWARD_SLASH) | paths = strings.Split(algorithmId, FORWARD_SLASH) | ||||
| last := paths[len(paths)-1] | last := paths[len(paths)-1] | ||||
| @@ -602,10 +603,56 @@ func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*colle | |||||
| } | } | ||||
| func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) { | func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) { | ||||
| err := s.GenerateSubmitParams(ctx, option) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| switch mode { | |||||
| case 1: | |||||
| err := s.GenerateSubmitParams(ctx, option) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| case 2: | |||||
| var dcuNum int64 | |||||
| for _, res := range option.ResourcesRequired { | |||||
| typeName, ok := res["type"] | |||||
| if !ok { | |||||
| continue | |||||
| } | |||||
| switch typeName { | |||||
| case DCU: | |||||
| num, ok := res["number"] | |||||
| if !ok { | |||||
| continue | |||||
| } | |||||
| n := common.ConvertTypeToString(num) | |||||
| val, err := strconv.ParseInt(n, 10, 64) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| dcuNum = val | |||||
| } | |||||
| } | |||||
| for k, v := range RESOURCESGAIMAP { | |||||
| if dcuNum == v.GPU { | |||||
| option.ResourceId = k | |||||
| break | |||||
| } | |||||
| if dcuNum == 0 && v.GPU == 1 { | |||||
| option.ResourceId = k | |||||
| break | |||||
| } | |||||
| if dcuNum >= 5 && v.GPU == 5 { | |||||
| option.ResourceId = k | |||||
| break | |||||
| } | |||||
| } | |||||
| option.ComputeCard = DCU | |||||
| default: | |||||
| return nil, errors.New("failed to choose submit mode") | |||||
| } | } | ||||
| task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType) | task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType) | ||||
| if err != nil { | if err != nil { | ||||
| return nil, err | return nil, err | ||||