|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414 |
- package schedule
-
- import (
- "context"
- "fmt"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
- "slices"
- "strings"
- "time"
-
- "github.com/pkg/errors"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
- "gopkg.in/yaml.v3"
-
- "github.com/zeromicro/go-zero/core/logx"
- )
-
// Tunables used while scheduling training tasks.
const (
	// TRAINNING_TASK_REPLICA is the replica count handed to the scheduling
	// strategies when they are constructed.
	TRAINNING_TASK_REPLICA = 1

	// TRAINNING_TASK_SUFFIX_LEN is not referenced in the visible part of this
	// file; presumably the length of a generated task-name suffix — TODO confirm.
	TRAINNING_TASK_SUFFIX_LEN = 10

	// QUERY_RESOURCE_RETRY is the maximum number of attempts made when
	// querying cluster resources for the least-load-first strategy.
	QUERY_RESOURCE_RETRY = 3
)
-
- type ScheduleCreateTaskLogic struct {
- logx.Logger
- ctx context.Context
- svcCtx *svc.ServiceContext
- queryResource *QueryResourcesLogic
- }
-
- func NewScheduleCreateTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleCreateTaskLogic {
- return &ScheduleCreateTaskLogic{
- Logger: logx.WithContext(ctx),
- ctx: ctx,
- svcCtx: svcCtx,
- queryResource: NewQueryResourcesLogic(ctx, svcCtx),
- }
- }
-
- func generateFilteredDataDistributes(clusters []*strategy.AssignedCluster, distribute types.DataDistribute) *entity.ClustersWithDataDistributes {
-
- var clusterIds []string
- for _, c := range clusters {
- clusterIds = append(clusterIds, c.ClusterId)
- }
-
- clustersWithDataDistributes := &entity.ClustersWithDataDistributes{
- Clusters: clusters,
- DataDistributes: &types.DataDistribute{
- Dataset: make([]*types.DatasetDistribute, 0),
- Image: make([]*types.ImageDistribute, 0),
- Model: make([]*types.ModelDistribute, 0),
- Code: make([]*types.CodeDistribute, 0),
- },
- }
-
- for _, datasetDistribute := range distribute.Dataset {
- dataset := &types.DatasetDistribute{}
- dataset.DataName = datasetDistribute.DataName
- dataset.PackageID = datasetDistribute.PackageID
- clusterScheduledList := make([]*types.ClusterScheduled, 0)
-
- if len(datasetDistribute.Clusters) != 0 {
- for _, cluster := range datasetDistribute.Clusters {
- if slices.Contains(clusterIds, cluster.ClusterID) {
- clusterScheduledList = append(clusterScheduledList, cluster)
- }
- }
- }
-
- dataset.Clusters = clusterScheduledList
- clustersWithDataDistributes.DataDistributes.Dataset = append(clustersWithDataDistributes.DataDistributes.Dataset, dataset)
- }
-
- for _, imageDistribute := range distribute.Image {
- image := &types.ImageDistribute{}
- image.DataName = imageDistribute.DataName
- image.PackageID = imageDistribute.PackageID
- clusterScheduledList := make([]*types.ClusterScheduled, 0)
-
- if len(imageDistribute.Clusters) != 0 {
- for _, cluster := range imageDistribute.Clusters {
- if slices.Contains(clusterIds, cluster.ClusterID) {
- clusterScheduledList = append(clusterScheduledList, cluster)
- }
- }
- }
-
- image.Clusters = clusterScheduledList
- clustersWithDataDistributes.DataDistributes.Image = append(clustersWithDataDistributes.DataDistributes.Image, image)
- }
-
- for _, codeDistribute := range distribute.Code {
- code := &types.CodeDistribute{}
- code.DataName = codeDistribute.DataName
- code.PackageID = codeDistribute.PackageID
- code.Output = codeDistribute.Output
- clusterScheduledList := make([]*types.ClusterScheduled, 0)
-
- if len(codeDistribute.Clusters) != 0 {
- for _, cluster := range codeDistribute.Clusters {
- if slices.Contains(clusterIds, cluster.ClusterID) {
- clusterScheduledList = append(clusterScheduledList, cluster)
- }
- }
- }
-
- code.Clusters = clusterScheduledList
- clustersWithDataDistributes.DataDistributes.Code = append(clustersWithDataDistributes.DataDistributes.Code, code)
- }
-
- for _, modelDistribute := range distribute.Model {
- model := &types.ModelDistribute{}
- model.DataName = modelDistribute.DataName
- model.PackageID = modelDistribute.PackageID
- clusterScheduledList := make([]*types.ClusterScheduled, 0)
-
- if len(modelDistribute.Clusters) != 0 {
- for _, cluster := range modelDistribute.Clusters {
- if slices.Contains(clusterIds, cluster.ClusterID) {
- clusterScheduledList = append(clusterScheduledList, cluster)
- }
- }
- }
-
- model.Clusters = clusterScheduledList
- clustersWithDataDistributes.DataDistributes.Model = append(clustersWithDataDistributes.DataDistributes.Model, model)
- }
-
- return clustersWithDataDistributes
- }
-
- func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) (resp *types.CreateTaskResp, err error) {
- resp = &types.CreateTaskResp{}
-
- err = task.ValidateJobResources(req.JobResources, "training")
- if err != nil {
- return nil, err
- }
-
- taskName, err := l.svcCtx.Scheduler.AiService.HandleDuplicateTaskName(req.Name, "training")
- if err != nil {
- return nil, err
- }
-
- var clusters []string
- if len(req.JobResources.Clusters) == 1 {
- clusters = append(clusters, req.JobResources.Clusters[0].ClusterID)
- schedatas, err := l.generateScheduleResult(req.DataDistributes, clusters)
- if err != nil {
- return nil, err
- }
-
- assignedClusters := task.CopyParams([]*strategy.AssignedCluster{{
- ClusterId: req.JobResources.Clusters[0].ClusterID, Replicas: 1,
- }}, req.JobResources.Clusters, "")
-
- // filter data distribution
- clustersWithDataDistributes := generateFilteredDataDistributes(assignedClusters, req.DataDistributes)
-
- taskId, err := l.createTask(taskName, req.Description, req.UserId, req.JobResources.ScheduleStrategy, clustersWithDataDistributes, req.Token, req.UserIp, req.UserName)
- if err != nil {
- return nil, err
- }
- resp.ScheduleDatas = schedatas
- resp.TaskID = taskId
- resp.TaskName = taskName
- return resp, nil
-
- } else {
- assignedClusters, err := l.getAssignedClustersByStrategy(&req.JobResources, &req.DataDistributes)
- if err != nil {
- return nil, err
- }
-
- if len(assignedClusters) == 0 {
- return nil, fmt.Errorf("failed to create task, no scheduled cluster found")
- }
-
- for _, c := range assignedClusters {
- clusters = append(clusters, c.ClusterId)
- }
-
- schedatas, err := l.generateScheduleResult(req.DataDistributes, clusters)
- if err != nil {
- return nil, err
- }
-
- // filter data distribution
- clustersWithDataDistributes := generateFilteredDataDistributes(assignedClusters, req.DataDistributes)
-
- taskId, err := l.createTask(taskName, req.Description, req.UserId, req.JobResources.ScheduleStrategy, clustersWithDataDistributes, req.Token, req.UserIp, req.UserName)
- if err != nil {
- return nil, err
- }
- resp.ScheduleDatas = schedatas
- resp.TaskID = taskId
- resp.TaskName = taskName
-
- return resp, nil
- }
- }
-
- func (l *ScheduleCreateTaskLogic) getAssignedClustersByStrategy(resources *types.JobResources, dataDistribute *types.DataDistribute) ([]*strategy.AssignedCluster, error) {
- var assignedClusters []*strategy.AssignedCluster
- switch resources.ScheduleStrategy {
- case strategy.LEASTLOADFIRST:
- var resSpecs []*collector.ResourceSpec
- var resCount int
- for i := 0; i < QUERY_RESOURCE_RETRY; i++ {
- defer time.Sleep(time.Second)
- qResources, err := l.queryResource.QueryResourcesByClusterId(nil, "Train")
- if err != nil {
- continue
- }
-
- for _, resource := range qResources {
- if resource.Resources != nil {
- resCount++
- }
- }
-
- if resCount >= 1 {
- resSpecs = qResources
- break
- } else {
- resCount = 0
- continue
- }
- }
-
- if resCount == 0 {
- return nil, fmt.Errorf("failed to create task, resources counting fails")
- }
-
- strtg := strategy.NewLeastLoadFirst(TRAINNING_TASK_REPLICA, resSpecs)
- clusters, err := strtg.Schedule()
- if err != nil {
- return nil, err
- }
- assignedClusters = task.CopyParams(clusters, resources.Clusters, "")
- case strategy.DATA_LOCALITY:
- strtg := strategy.NewDataLocality(TRAINNING_TASK_REPLICA, dataDistribute)
- clusters, err := strtg.Schedule()
- if err != nil {
- return nil, err
- }
- assignedClusters = task.CopyParams(clusters, resources.Clusters, "")
- default:
- return nil, errors.New("no strategy has been chosen")
- }
-
- return assignedClusters, nil
- }
-
- func (l *ScheduleCreateTaskLogic) createTask(taskName string, desc string, userId int64, strategyName string, clustersWithDataDistributes *entity.ClustersWithDataDistributes, token string, userIp string, userName string) (int64, error) {
- var synergyStatus int64
- if len(clustersWithDataDistributes.Clusters) > 1 {
- synergyStatus = 1
- }
-
- y, err := yaml.Marshal(clustersWithDataDistributes)
- if err != nil {
- fmt.Printf("Error while Marshaling. %v", err)
- }
-
- taskId, err := l.svcCtx.Scheduler.CreateTask(taskName, desc, userId, synergyStatus, strategyName, string(y), token, userIp, &l.svcCtx.Config, userName)
- if err != nil {
- return 0, err
- }
-
- return taskId, nil
- }
-
- func (l *ScheduleCreateTaskLogic) generateScheduleResult(distribute types.DataDistribute, clusters []string) ([]*types.ScheduleData, error) {
- var schedatas []*types.ScheduleData
-
- for _, d := range distribute.Dataset {
- data := &types.ScheduleData{
- DataType: "dataset",
- PackageID: d.PackageID,
- ClusterIDs: make([]string, 0),
- }
-
- var cSlc []string
- for _, cluster := range d.Clusters {
- cSlc = append(cSlc, cluster.ClusterID)
- }
-
- for _, cluster := range clusters {
- if !slices.Contains(cSlc, cluster) {
- data.ClusterIDs = append(data.ClusterIDs, cluster)
- } else {
- continue
- }
- }
- if len(data.ClusterIDs) != 0 {
- schedatas = append(schedatas, data)
- }
- }
-
- for _, d := range distribute.Code {
- data := &types.ScheduleData{
- DataType: "code",
- PackageID: d.PackageID,
- ClusterIDs: make([]string, 0),
- }
-
- var cSlc []string
- for _, cluster := range d.Clusters {
- cSlc = append(cSlc, cluster.ClusterID)
- }
-
- for _, cluster := range clusters {
- if !slices.Contains(cSlc, cluster) {
- data.ClusterIDs = append(data.ClusterIDs, cluster)
- } else {
- continue
- }
- }
- if len(data.ClusterIDs) != 0 {
- schedatas = append(schedatas, data)
- }
- }
-
- for _, d := range distribute.Image {
- data := &types.ScheduleData{
- DataType: "image",
- PackageID: d.PackageID,
- ClusterIDs: make([]string, 0),
- }
-
- var cSlc []string
- for _, cluster := range d.Clusters {
- cSlc = append(cSlc, cluster.ClusterID)
- }
-
- for _, cluster := range clusters {
- if !slices.Contains(cSlc, cluster) {
- data.ClusterIDs = append(data.ClusterIDs, cluster)
- } else {
- continue
- }
- }
- if len(data.ClusterIDs) != 0 {
- schedatas = append(schedatas, data)
- }
- }
-
- for _, d := range distribute.Model {
- data := &types.ScheduleData{
- DataType: "model",
- PackageID: d.PackageID,
- ClusterIDs: make([]string, 0),
- }
-
- var cSlc []string
- for _, cluster := range d.Clusters {
- cSlc = append(cSlc, cluster.ClusterID)
- }
-
- for _, cluster := range clusters {
- if !slices.Contains(cSlc, cluster) {
- data.ClusterIDs = append(data.ClusterIDs, cluster)
- } else {
- continue
- }
- }
- if len(data.ClusterIDs) != 0 {
- schedatas = append(schedatas, data)
- }
- }
-
- if len(schedatas) != 0 {
- err := l.updateStorageType(&schedatas)
- if err != nil {
- return nil, err
- }
- }
-
- return schedatas, nil
- }
-
- func (l *ScheduleCreateTaskLogic) updateStorageType(schedatas *[]*types.ScheduleData) error {
-
- for _, s := range *schedatas {
- var storageType string
- var sTypes []string
- for _, id := range s.ClusterIDs {
- cluster, err := l.svcCtx.Scheduler.AiStorages.GetClustersById(id)
- if err != nil {
- return err
- }
- stype, ok := storeLink.StorageTypeMap[strings.Title(cluster.Name)]
- if ok {
- sTypes = append(sTypes, stype)
- }
- }
- sTypes = common.Unique(sTypes)
- for _, st := range sTypes {
- storageType += st + storeLink.COMMA
- }
- storageType = strings.TrimSuffix(storageType, storeLink.COMMA)
- s.StorageType = storageType
- }
-
- return nil
- }
|