Former-commit-id: fb05379230
pull/94/head
@@ -28,11 +28,11 @@ func NewScheduleSubmitLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Sc
 func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) {
     resp = &types.ScheduleResp{}
     opt := &option.AiOption{
-        ResourceType: req.AiOption.ResourceType,
-        Tops: 0,
-        TaskType: req.AiOption.TaskType,
-        DatasetsName: req.AiOption.Datasets,
-        AlgorithmName: "cnn",
+        ResourceType: req.AiOption.ResourceType,
+        Tops: 0,
+        TaskType: req.AiOption.TaskType,
+        DatasetsName: req.AiOption.Datasets,
+        //AlgorithmName: "cnn",
         StrategyName: req.AiOption.Strategy,
         ClusterToStaticWeight: nil,
         Params: []string{
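For orientation, here is a minimal, self-contained sketch of the request-to-option mapping in the hunk above. `aiOption` and `scheduleReq` are hypothetical stand-ins that mirror only the fields visible in this diff; they are not the real `option.AiOption` and `types.ScheduleReq` definitions, and the sample values in `main` are made up.

```go
package main

import "fmt"

// aiOption is a hypothetical stand-in mirroring only the option fields
// visible in the hunk above; the real type lives in the option package.
type aiOption struct {
	ResourceType  string
	Tops          float64
	TaskType      string
	DatasetsName  string
	AlgorithmName string // no longer hard-coded to "cnn" in the new version
	StrategyName  string
	Params        []string
}

// scheduleReq is a stand-in for the request fields referenced above.
type scheduleReq struct {
	ResourceType string
	TaskType     string
	Datasets     string
	Strategy     string
}

// buildOption mirrors the mapping performed in ScheduleSubmit.
func buildOption(req scheduleReq) *aiOption {
	return &aiOption{
		ResourceType: req.ResourceType,
		Tops:         0,
		TaskType:     req.TaskType,
		DatasetsName: req.Datasets,
		StrategyName: req.Strategy,
	}
}

func main() {
	// Sample values for illustration only.
	opt := buildOption(scheduleReq{ResourceType: "computeCard", TaskType: "train", Datasets: "demo", Strategy: "dynamicResources"})
	fmt.Printf("%+v\n", opt)
}
```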
@@ -2,7 +2,6 @@ package database
 import (
     "github.com/zeromicro/go-zero/core/logx"
-    "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
     "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
@@ -24,12 +23,12 @@ func (s *AiStorage) GetParticipants() (*types.ClusterListResp, error) {
     return &resp, nil
 }

-func (s *AiStorage) SaveTask(cluster strategy.AssignedCluster) error {
+func (s *AiStorage) SaveTask(name string) error {
     // Build the main task struct
     taskModel := models.Task{
         Status: constants.Saved,
         Description: "ai task",
-        Name: "testAi",
+        Name: name,
         CommitTime: time.Now(),
     }
     // Save the task record to the database
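A minimal sketch of the task record that `SaveTask` builds above, assuming only the fields shown in this hunk; `taskRecord` and `savedStatus` are hypothetical stand-ins for `models.Task` and `constants.Saved`, and the database write itself is omitted.

```go
package main

import (
	"fmt"
	"time"
)

// taskRecord is a hypothetical stand-in for models.Task, limited to the
// fields used in the hunk above.
type taskRecord struct {
	Status      string
	Description string
	Name        string
	CommitTime  time.Time
}

// savedStatus stands in for constants.Saved; the real constant may differ.
const savedStatus = "Saved"

// newTaskRecord mirrors the body of SaveTask: the task name now comes from
// the caller instead of the previously hard-coded "testAi".
func newTaskRecord(name string) taskRecord {
	return taskRecord{
		Status:      savedStatus,
		Description: "ai task",
		Name:        name,
		CommitTime:  time.Now(),
	}
}

func main() {
	fmt.Printf("%+v\n", newTaskRecord("example-task")) // hypothetical name
}
```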
@@ -100,6 +100,8 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) ([]inter
         return nil, errors.New("clusters is nil")
     }

+    //res := struct {
+    //}{}
     var wg sync.WaitGroup
     var result []interface{}
     var errs []error
@@ -115,6 +117,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) ([]inter
         wg.Add(1)
         go func() {
             resp, err := executorMap[c.Name].Execute(as.ctx, as.option)
             if err != nil {
                 // TODO: database operation
                 errCh <- err
@@ -122,15 +125,20 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) ([]inter
                 return
             }
             // TODO: database operation
-            ch <- resp
+            data := struct {
+                Resp interface{}
+                ClusterId int64
+            }{
+                Resp: resp,
+                ClusterId: c.ParticipantId,
+            }
+            ch <- data
             wg.Done()
         }()
     }
     wg.Wait()
-    for s := range ch {
-        result = append(result, s)
-    }
     close(ch)
     close(errCh)
     for e := range errCh {
         errs = append(errs, e)
@@ -140,6 +148,19 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) ([]inter
         return nil, errors.New("submit task failed")
     }
+    for s := range ch {
+        data := (s).(struct {
+            Resp interface{}
+            ClusterId int64
+        })
+        result = append(result, data.Resp)
+    }
+    err := as.AiStorages.SaveTask(as.option.TaskName)
+    if err != nil {
+        return nil, err
+    }
     return result, nil
 }
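AssignTask above follows a fan-out/fan-in shape: each goroutine sends either a result or an error, both channels are closed only after `wg.Wait()`, and only then are they drained, so the `range` loops are guaranteed to terminate. A minimal, self-contained sketch of that pattern, assuming buffered channels sized to the number of workers (the worker function here is hypothetical, not the scheduler's real executor):

```go
package main

import (
	"errors"
	"fmt"
	"sync"
)

// runAll fans a hypothetical work function out over the inputs and collects
// results and errors through buffered channels, mirroring the shape of
// AssignTask above: close the channels only after wg.Wait(), then drain them.
func runAll(inputs []int, work func(int) (string, error)) ([]string, []error) {
	var wg sync.WaitGroup
	ch := make(chan string, len(inputs)) // buffered so workers never block
	errCh := make(chan error, len(inputs))

	for _, in := range inputs {
		in := in // capture the loop variable for the goroutine
		wg.Add(1)
		go func() {
			defer wg.Done()
			resp, err := work(in)
			if err != nil {
				errCh <- err
				return
			}
			ch <- resp
		}()
	}

	wg.Wait()
	close(ch)    // safe: all writers have finished
	close(errCh) // ranging over a closed channel drains it, then stops

	var results []string
	for r := range ch {
		results = append(results, r)
	}
	var errs []error
	for e := range errCh {
		errs = append(errs, e)
	}
	return results, errs
}

func main() {
	work := func(n int) (string, error) {
		if n%2 == 0 {
			return fmt.Sprintf("job-%d ok", n), nil
		}
		return "", errors.New("odd job failed")
	}
	results, errs := runAll([]int{1, 2, 3, 4}, work)
	fmt.Println(results, errs)
}
```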
@@ -49,7 +49,7 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) {
     if opt.ResourceType == "computeCard" {
         var maxCurrentCardHours float64
         for _, card := range res.CardsAvail {
-            cardHours := common.RoundFloat(card.TOpsAtFp16*card.CardHours, 3)
+            cardHours := common.RoundFloat( /*card.TOpsAtFp16**/ card.CardHours, 3)
             if cardHours > maxCurrentCardHours {
                 maxCurrentCardHours = cardHours
             }
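For reference, a sketch of a rounding helper along the lines of what `common.RoundFloat` appears to do at its call sites in this diff (round to a fixed number of decimal places); this is an assumption, and the real helper in the `common` package may differ.

```go
package main

import (
	"fmt"
	"math"
)

// roundFloat rounds v to the given number of decimal places. This is an
// assumption about what common.RoundFloat does, based on how it is called
// above; the real helper may behave differently.
func roundFloat(v float64, places uint) float64 {
	ratio := math.Pow(10, float64(places))
	return math.Round(v*ratio) / ratio
}

func main() {
	// With the TOpsAtFp16 factor commented out, the ranking above falls back
	// to raw card-hours.
	cardHours := 12.34567
	fmt.Println(roundFloat(cardHours, 3)) // 12.346
}
```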
@@ -284,14 +284,14 @@ func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceS
     totalDcu := limitResp.Data.AccountMaxDcu
     //disk
-    diskReq := &hpcAC.ParaStorQuotaReq{}
-    diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq)
-    if err != nil {
-        return nil, err
-    }
-    totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB*KB, 3)
-    availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB*KB, 3)
+    //diskReq := &hpcAC.ParaStorQuotaReq{}
+    //diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq)
+    //if err != nil {
+    //    return nil, err
+    //}
+    //
+    //totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB*KB, 3)
+    //availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB*KB, 3)
     //memory
     nodeResp, err := s.aCRpc.GetNodeResources(ctx, nil)
@@ -349,12 +349,12 @@ func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceS
         Balance: balance,
         CpuCoreTotal: totalCpu,
         CpuCoreAvail: CpuCoreAvail,
-        DiskTotal: totalDisk,
-        DiskAvail: availDisk,
-        MemTotal: memSize,
-        MemAvail: MemAvail,
-        CpuCoreHours: cpuHours,
-        CardsAvail: cards,
+        //DiskTotal: totalDisk,
+        //DiskAvail: availDisk,
+        MemTotal: memSize,
+        MemAvail: MemAvail,
+        CpuCoreHours: cpuHours,
+        CardsAvail: cards,
     }
     return resourceStats, nil
@@ -381,7 +381,7 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm,
     var algorithms []*collector.Algorithm
     for _, t := range GetTaskTypes() {
         taskType := t
-        req := &hpcAC.GetFileListReq{Limit: 100, Path: ALGORITHM_DIR + FORWARD_SLASH + taskType, Start: 0}
+        req := &hpcAC.GetFileListReq{Limit: 100, Path: ALGORITHM_DIR + FORWARD_SLASH + taskType, Start: 0, Order: "asc", OrderBy: "name", KeyWord: ""}
         list, err := s.aCRpc.GetFileList(ctx, req)
         if err != nil {
             return nil, err
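A rough sketch of how a sorted, paged file-list request like the one above could be consumed. The `fileListReq` type and the `listAll`/`fake` functions are hypothetical stand-ins, included only to illustrate the `Limit`/`Start`/`Order`/`OrderBy`/`KeyWord` parameters; they are not the real `hpcAC` client API.

```go
package main

import "fmt"

// fileListReq is a hypothetical stand-in for hpcAC.GetFileListReq, limited to
// the fields used in the hunk above.
type fileListReq struct {
	Limit   int
	Start   int
	Path    string
	Order   string // "asc" or "desc"
	OrderBy string // e.g. "name"
	KeyWord string // empty means no keyword filtering
}

// listAll pages through a hypothetical listing function until a short page is
// returned, advancing Start by Limit each round.
func listAll(path string, list func(fileListReq) ([]string, error)) ([]string, error) {
	req := fileListReq{Limit: 100, Path: path, Start: 0, Order: "asc", OrderBy: "name", KeyWord: ""}
	var names []string
	for {
		page, err := list(req)
		if err != nil {
			return nil, err
		}
		names = append(names, page...)
		if len(page) < req.Limit {
			return names, nil
		}
		req.Start += req.Limit
	}
}

func main() {
	fake := func(req fileListReq) ([]string, error) {
		// A fake backend that returns the same entries for any path.
		return []string{"cnn", "resnet", "transformer"}, nil
	}
	names, _ := listAll("/algorithms/train", fake) // hypothetical path
	fmt.Println(names)
}
```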