|
|
|
@@ -6,6 +6,7 @@ import ( |
|
|
|
"fmt" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" |
|
|
|
@@ -14,6 +15,7 @@ import ( |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" |
|
|
|
"slices" |
|
|
|
"strings" |
|
|
|
"time" |
|
|
|
|
|
|
|
"github.com/zeromicro/go-zero/core/logx" |
|
|
|
) |
|
|
|
@@ -21,6 +23,7 @@ import ( |
|
|
|
// Scheduling tunables used by the task-creation logic in this file.
const ( |
|
|
|
// TRAINNING_TASK_REPLICA is passed as the first argument to
// strategy.NewLeastLoadFirst in getClusterInfosByStrategy.
TRAINNING_TASK_REPLICA = 1 |
|
|
|
// TRAINNING_TASK_SUFFIX_LEN is not used in the visible part of the file;
// presumably the length of a generated task-name suffix — TODO confirm.
TRAINNING_TASK_SUFFIX_LEN = 10 |
|
|
|
// QUERY_RESOURCE_RETRY is the upper bound on resource-query attempts in
// getClusterInfosByStrategy.
QUERY_RESOURCE_RETRY = 3 |
|
|
|
) |
|
|
|
|
|
|
|
type ScheduleCreateTaskLogic struct { |
|
|
|
@@ -70,6 +73,10 @@ func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) ( |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
|
|
|
|
if len(clusterInfos) == 0 { |
|
|
|
return nil, fmt.Errorf("failed to create task, no scheduled cluster found") |
|
|
|
} |
|
|
|
|
|
|
|
for _, info := range clusterInfos { |
|
|
|
clusters = append(clusters, info.ClusterID) |
|
|
|
} |
|
|
|
@@ -89,26 +96,59 @@ func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) ( |
|
|
|
} |
|
|
|
|
|
|
|
func (l *ScheduleCreateTaskLogic) getClusterInfosByStrategy(resources *types.JobResources) ([]*types.JobClusterInfo, error) { |
|
|
|
cResources, err := l.queryResource.queryResources(make([]string, 0)) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
|
|
|
|
var resSpecs []*collector.ResourceSpec |
|
|
|
var resCount int |
|
|
|
for i := 0; i < QUERY_RESOURCE_RETRY; i++ { |
|
|
|
defer time.Sleep(time.Second) |
|
|
|
qResources, err := l.queryResource.queryResources(make([]string, 0)) |
|
|
|
if err != nil { |
|
|
|
continue |
|
|
|
} |
|
|
|
|
|
|
|
for _, resource := range qResources { |
|
|
|
if resource.Resources != nil { |
|
|
|
resCount++ |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if resCount >= 1 { |
|
|
|
resSpecs = qResources |
|
|
|
break |
|
|
|
} else { |
|
|
|
resCount = 0 |
|
|
|
continue |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if resCount == 0 { |
|
|
|
return nil, fmt.Errorf("failed to create task, resources counting fails") |
|
|
|
} |
|
|
|
|
|
|
|
var clusterInfos []*types.JobClusterInfo |
|
|
|
switch resources.ScheduleStrategy { |
|
|
|
case strategy.LEASTLOADFIRST: |
|
|
|
strtg := strategy.NewLeastLoadFirst(TRAINNING_TASK_REPLICA, cResources) |
|
|
|
strtg := strategy.NewLeastLoadFirst(TRAINNING_TASK_REPLICA, resSpecs) |
|
|
|
clusters, err := strtg.Schedule() |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
clusterInfos = genClusterInfos(clusters, resources.Clusters) |
|
|
|
clusterInfos = filterClusterInfos(clusters, resources.Clusters) |
|
|
|
} |
|
|
|
|
|
|
|
return clusterInfos, nil |
|
|
|
} |
|
|
|
|
|
|
|
func genClusterInfos(clusters []*strategy.AssignedCluster, clusterInfos []*types.JobClusterInfo) []*types.JobClusterInfo { |
|
|
|
return nil |
|
|
|
func filterClusterInfos(clusters []*strategy.AssignedCluster, clusterInfos []*types.JobClusterInfo) []*types.JobClusterInfo { |
|
|
|
var result []*types.JobClusterInfo |
|
|
|
for _, cinfo := range clusterInfos { |
|
|
|
for _, c := range clusters { |
|
|
|
if cinfo.ClusterID == c.ClusterId { |
|
|
|
result = append(result, cinfo) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
return result |
|
|
|
} |
|
|
|
|
|
|
|
func (l *ScheduleCreateTaskLogic) createTask(taskName string, strategyName string, jobClusterInfo []*types.JobClusterInfo) (int64, error) { |
|
|
|
|