Browse Source

updated runtasklogic

pull/376/head
tzwang 10 months ago
parent
commit
3cf1521eca
10 changed files with 117 additions and 40 deletions
  1. +1
    -1
      internal/logic/cloud/commitgeneraltasklogic.go
  2. +1
    -1
      internal/logic/core/commitvmtasklogic.go
  3. +5
    -1
      internal/logic/schedule/schedulecanceltasklogic.go
  4. +48
    -26
      internal/logic/schedule/scheduleruntasklogic.go
  5. +1
    -1
      internal/logic/schedule/schedulesubmitlogic.go
  6. +1
    -1
      internal/mqs/ScheduleAi.go
  7. +4
    -4
      internal/scheduler/scheduler.go
  8. +4
    -2
      internal/scheduler/schedulers/aiScheduler.go
  9. +2
    -0
      internal/scheduler/schedulers/option/aiOption.go
  10. +50
    -3
      internal/storeLink/shuguangai.go

+ 1
- 1
internal/logic/cloud/commitgeneraltasklogic.go View File

@@ -70,7 +70,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
utils.Convert(&req, &opt)
sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient)

results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.JOINT_CLOUD_MODE, nil)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
return err


+ 1
- 1
internal/logic/core/commitvmtasklogic.go View File

@@ -63,7 +63,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
return nil, err
}
// 3、Return scheduling results
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.JOINT_CLOUD_MODE, nil)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
return nil, err


+ 5
- 1
internal/logic/schedule/schedulecanceltasklogic.go View File

@@ -24,7 +24,11 @@ func NewScheduleCancelTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext)
}

// ScheduleCancelTask looks up the task identified by req.TaskId through
// l.svcCtx.Scheduler.AiStorages.GetTaskById.
//
// If the lookup fails, it returns a nil response together with that error.
// Otherwise it falls through to a naked return, so the named results resp and
// err are returned as they stand (resp is never assigned in this function).
//
// NOTE(review): the task returned by GetTaskById is discarded and no
// cancellation step is visible here; the function currently only verifies
// that the lookup succeeds.
func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) {
// todo: add your logic here and delete this line
// find task
_, err = l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskId)
if err != nil {
return nil, err
}

// Naked return: resp is still its zero value (nil) and err is nil here.
return
}

+ 48
- 26
internal/logic/schedule/scheduleruntasklogic.go View File

@@ -6,12 +6,16 @@ import (
"errors"
"fmt"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gopkg.in/yaml.v2"
"strings"
)

type ScheduleRunTaskLogic struct {
@@ -49,8 +53,9 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
return nil, err
}

_ = &option.AiOption{
opt := &option.AiOption{
AdapterId: ADAPTERID,
TaskName: task.Name,
}
// update assignedClusters
err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas)
@@ -58,35 +63,52 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
return nil, err
}

//aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
//if err != nil {
// return nil, err
//}
//
//results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.STORAGE_SCHEDULE_MODE, clusters)
//if err != nil {
// return nil, err
//}

//adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(ADAPTERID)
//if err != nil {
// return nil, err
//}
//
//for _, i := range clusters {
// clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(i.ClusterID)
//
// opt := &option.AiOption{}
//
// err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, i.ClusterID, clusterName, "", constants.Saved, "")
// if err != nil {
// return nil, errors.New("database add failed: " + err.Error())
// }
//}
aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
if err != nil {
return nil, err
}

results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_STORAGE_SCHEDULE, clusters)
if err != nil {
return nil, err
}

rs := (results).([]*schedulers.AiResult)

err = l.SaveResult(task, rs, opt)
if err != nil {
return nil, err
}

return
}

// SaveResult stores one AI task record per scheduling result and emits a
// notice for each of them.
//
// For every entry in results it:
//   - sets opt.ComputeCard to the upper-cased r.Card,
//   - resolves the adapter name from r.AdapterId,
//   - resolves the cluster name from r.ClusterId,
//   - calls AiStorages.SaveAiTask with task.Id, opt, the resolved names,
//     r.JobId, the constants.Saved status and r.Msg,
//   - calls AiStorages.AddNoticeInfo with the "create" notice type.
//
// It returns the first error from GetAdapterNameById or SaveAiTask and stops
// processing the remaining results at that point; records already saved for
// earlier results are not rolled back here. It returns nil when every result
// has been processed.
func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error {

for _, r := range results {

// opt is a shared pointer: ComputeCard is overwritten on every iteration,
// so after the loop the caller's opt holds the value from the last result.
opt.ComputeCard = strings.ToUpper(r.Card)

adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(r.AdapterId)
if err != nil {
return err
}

// The lookup error is deliberately discarded with the blank identifier;
// clusterName is used as returned even if the lookup failed.
// NOTE(review): presumably an empty name on failure; confirm this
// best-effort behavior is intended.
clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId)

err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg)
if err != nil {
return err
}

// Any result of AddNoticeInfo is not inspected here. The message literal
// is Chinese for "task is being created".
l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(r.AdapterId, adapterName, r.ClusterId, clusterName, r.TaskName, "create", "任务创建中")

}

return nil

}

func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error {
for _, cluster := range *assignedClusters {
for _, data := range scheduledDatas {


+ 1
- 1
internal/logic/schedule/schedulesubmitlogic.go View File

@@ -52,7 +52,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
return nil, err
}

results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
return nil, err
}


+ 1
- 1
internal/mqs/ScheduleAi.go View File

@@ -41,7 +41,7 @@ func (l *AiQueue) Consume(val string) error {
aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil)

// 调度算法
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
return err
}


+ 4
- 4
internal/scheduler/scheduler.go View File

@@ -30,8 +30,8 @@ import (
)

const (
JOINT_CLOUD_MODE = iota + 1
STORAGE_SCHEDULE_MODE
SUBMIT_MODE_JOINT_CLOUD = iota + 1
SUBMIT_MODE_STORAGE_SCHEDULE
)

type Scheduler struct {
@@ -134,7 +134,7 @@ func (s *Scheduler) TempAssign() error {
func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) {
var result interface{}
switch mode {
case JOINT_CLOUD_MODE:
case SUBMIT_MODE_JOINT_CLOUD:
//choose strategy
strategy, err := ss.PickOptimalStrategy()
if err != nil {
@@ -155,7 +155,7 @@ func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters

result = resp

case STORAGE_SCHEDULE_MODE:
case SUBMIT_MODE_STORAGE_SCHEDULE:

//assign tasks to clusters
resp, err := ss.AssignTask(assignedClusters, mode)


+ 4
- 2
internal/scheduler/schedulers/aiScheduler.go View File

@@ -175,7 +175,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int
opt, _ := cloneAiOption(as.option)

// decide opt params by mode
updateAiOptionByMode(c, opt, scheduler.STORAGE_SCHEDULE_MODE)
updateAiOptionByMode(c, opt, mode)

resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode)
if err != nil {
@@ -282,7 +282,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int

func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) {
switch mode {
case scheduler.STORAGE_SCHEDULE_MODE:
case scheduler.SUBMIT_MODE_STORAGE_SCHEDULE:
opt.Cmd = cluster.Cmd
opt.Envs = cluster.Envs
opt.Params = cluster.Params
@@ -290,6 +290,8 @@ func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOptio
opt.ImageId = cluster.ImageId
opt.AlgorithmId = cluster.CodeId
opt.DatasetsId = cluster.DatasetId

opt.ResourcesRequired = cluster.ResourcesRequired
default:

}


+ 2
- 0
internal/scheduler/schedulers/option/aiOption.go View File

@@ -32,6 +32,8 @@ type AiOption struct {
AlgorithmCode string
Image string
Model interface{}

ResourcesRequired []map[string]interface{}
}

func (a AiOption) GetOptionType() string {


+ 50
- 3
internal/storeLink/shuguangai.go View File

@@ -179,6 +179,7 @@ func (s *ShuguangAi) SubmitPytorchTask(ctx context.Context, imageId string, cmd
workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2]
codePath = workPath + FORWARD_SLASH + TRAIN_FILE
} else {
// storage schedule submit mode
codePath = algorithmId
paths = strings.Split(algorithmId, FORWARD_SLASH)
last := paths[len(paths)-1]
@@ -602,10 +603,56 @@ func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*colle
}

func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
err := s.GenerateSubmitParams(ctx, option)
if err != nil {
return nil, err
switch mode {
case 1:
err := s.GenerateSubmitParams(ctx, option)
if err != nil {
return nil, err
}
case 2:
var dcuNum int64
for _, res := range option.ResourcesRequired {
typeName, ok := res["type"]
if !ok {
continue
}
switch typeName {
case DCU:
num, ok := res["number"]
if !ok {
continue
}
n := common.ConvertTypeToString(num)
val, err := strconv.ParseInt(n, 10, 64)
if err != nil {
return nil, err
}
dcuNum = val
}
}
for k, v := range RESOURCESGAIMAP {
if dcuNum == v.GPU {
option.ResourceId = k
break
}

if dcuNum == 0 && v.GPU == 1 {
option.ResourceId = k
break
}

if dcuNum >= 5 && v.GPU == 5 {
option.ResourceId = k
break
}
}

option.ComputeCard = DCU

default:
return nil, errors.New("failed to choose submit mode")
}

task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
if err != nil {
return nil, err


Loading…
Cancel
Save