|
- package schedule
-
- import (
- "context"
- "encoding/json"
- "errors"
- "fmt"
- "strings"
-
- "github.com/zeromicro/go-zero/core/logx"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
- "gopkg.in/yaml.v2"
- )
-
- type ScheduleRunTaskLogic struct {
- logx.Logger
- ctx context.Context
- svcCtx *svc.ServiceContext
- }
-
- func NewScheduleRunTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleRunTaskLogic {
- return &ScheduleRunTaskLogic{
- Logger: logx.WithContext(ctx),
- ctx: ctx,
- svcCtx: svcCtx,
- }
- }
-
- func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *types.RunTaskResp, err error) {
- // find task
- task, err := l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskID)
- if err != nil {
- return nil, err
- }
-
- if task == nil {
- return nil, errors.New("task not found ")
- }
-
- if task.Status != constants.Saved {
- switch task.Status {
- case constants.Cancelled:
- return nil, errors.New("task has been cancelled ")
- case constants.Failed:
- return nil, errors.New("task was already failed ")
- case constants.Running:
- return nil, errors.New("task is running ")
- case constants.Succeeded:
- return nil, errors.New("task is completed ")
- default:
- return nil, fmt.Errorf("task is being: %s", task.Status)
- }
- }
-
- var clustersWithDataDistributes entity.ClustersWithDataDistributes
- err = yaml.Unmarshal([]byte(task.YamlString), &clustersWithDataDistributes)
- if err != nil {
- return nil, err
- }
-
- opt := &option.AiOption{
- AdapterId: ADAPTERID,
- TaskName: task.Name,
- TaskId: task.Id,
- StrategyName: "",
- ResourcesRequired: clustersWithDataDistributes.Clusters[0].ResourcesRequired,
- }
-
- // update assignedClusters
- assignedClusters, err := updateClustersByScheduledDatas(task.Id, &clustersWithDataDistributes, req.ScheduledDatas)
- if err != nil {
- return nil, err
- }
-
- aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
- if err != nil {
- return nil, err
- }
-
- results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, executor.SUBMIT_MODE_STORAGE_SCHEDULE, assignedClusters)
- if err != nil {
- return nil, err
- }
-
- rs := (results).([]*schedulers.AiResult)
-
- err = l.SaveResult(task, rs, opt)
- if err != nil {
- return nil, err
- }
-
- return
- }
-
- func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error {
-
- for _, r := range results {
-
- opt.ComputeCard = strings.ToUpper(r.Card)
- opt.Replica = r.Replica
- opt.Output = r.Output
-
- adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(r.AdapterId)
- if err != nil {
- return err
- }
-
- clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId)
-
- err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg)
- if err != nil {
- return err
- }
-
- l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(r.AdapterId, adapterName, r.ClusterId, clusterName, r.TaskName, "create", "任务创建中")
-
- }
-
- return nil
-
- }
-
- func updateClustersByScheduledDatas(taskId int64, clustersWithDataDistributes *entity.ClustersWithDataDistributes, scheduledDatas []*types.DataScheduleResults) ([]*strategy.AssignedCluster, error) {
- assignedClusters := make([]*strategy.AssignedCluster, 0)
-
- if len(scheduledDatas) == 0 {
- for _, cluster := range clustersWithDataDistributes.Clusters {
- assignedClusters = append(assignedClusters, cluster)
- }
- } else {
- // handle pass-in scheduledDatas
- for _, cluster := range clustersWithDataDistributes.Clusters {
- for _, data := range scheduledDatas {
- switch data.DataType {
- case "dataset":
- for _, result := range data.Results {
- if !result.Status {
- continue
- }
- for _, c := range result.Clusters {
- if cluster.ClusterId == c.ClusterID {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "dataset")
- }
- cluster.DatasetId = jsonData.Id
- }
- }
- }
- case "image":
- for _, result := range data.Results {
- if !result.Status {
- continue
- }
- for _, c := range result.Clusters {
- if cluster.ClusterId == c.ClusterID {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "image")
- }
- cluster.ImageId = jsonData.Id
- }
- }
- }
- case "code":
- for _, result := range data.Results {
- if !result.Status {
- continue
- }
- for _, c := range result.Clusters {
- if cluster.ClusterId == c.ClusterID {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "code")
- }
- cluster.CodeId = jsonData.Id
- }
- }
- }
- case "model":
- for _, result := range data.Results {
- if !result.Status {
- continue
- }
- for _, c := range result.Clusters {
- if cluster.ClusterId == c.ClusterID {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "model")
- }
- cluster.ModelId = jsonData.Id
- }
- }
- }
- }
- }
- assignedClusters = append(assignedClusters, cluster)
- }
- }
-
- // handle db yaml clustersWithDataDistributes
- for _, cluster := range assignedClusters {
- if cluster.DatasetId == "" {
- for _, distribute := range clustersWithDataDistributes.DataDistributes.Dataset {
- for _, c := range distribute.Clusters {
- if cluster.ClusterId == c.ClusterID {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("db yaml jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "dataset")
- }
- cluster.DatasetId = jsonData.Id
- }
- }
- }
- }
-
- if cluster.ImageId == "" {
- for _, distribute := range clustersWithDataDistributes.DataDistributes.Image {
- for _, c := range distribute.Clusters {
- if cluster.ClusterId == c.ClusterID {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("db yaml jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "image")
- }
- cluster.ImageId = jsonData.Id
- }
- }
- }
- }
-
- //if cluster.CodeId == "" {
- for _, distribute := range clustersWithDataDistributes.DataDistributes.Code {
- for _, c := range distribute.Clusters {
- if cluster.ClusterId == c.ClusterID {
-
- cluster.Output = distribute.Output
-
- if cluster.CodeId == "" {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("db yaml jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "code")
- }
- cluster.CodeId = jsonData.Id
- }
-
- }
- }
- }
-
- if cluster.ModelId == "" {
- for _, distribute := range clustersWithDataDistributes.DataDistributes.Model {
- for _, c := range distribute.Clusters {
- if cluster.ClusterId == c.ClusterID {
- if c.JsonData == "" {
- continue
- }
- jsonData := entity.JsonData{}
- err := json.Unmarshal([]byte(c.JsonData), &jsonData)
- if err != nil {
- return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "model")
- }
- cluster.ModelId = jsonData.Id
- }
- }
- }
- }
- }
-
- // check empty data
- for _, cluster := range assignedClusters {
- if cluster.DatasetId == "" {
- return nil, fmt.Errorf("failed to run task %d, cluster %s cannot find %s", taskId, cluster.ClusterId, "DatasetId")
- }
-
- if cluster.ImageId == "" {
- return nil, fmt.Errorf("failed to run task %d, cluster %s cannot find %s", taskId, cluster.ClusterId, "ImageId")
- }
-
- if cluster.CodeId == "" {
- return nil, fmt.Errorf("failed to run task %d, cluster %s cannot find %s", taskId, cluster.ClusterId, "CodeId")
- }
- }
-
- return assignedClusters, nil
- }
|