# Conflicts:
# api/internal/logic/schedule/schedulesubmitlogic.go
# api/internal/scheduler/database/aiStorage.go
Former-commit-id: dd876d4244
pull/178/head
| @@ -1733,6 +1733,8 @@ PayloadCreateTrainJob{ | |||
// AiTask is one AI task entry returned in the center task list.
AiTask {
	Name        string `json:"name,optional"`    // task name
	Status      string `json:"status,optional"`  // task status
	Cluster     string `json:"cluster,optional"` // name of the cluster the task was stored with
	Card        string `json:"card,optional"`    // compute card recorded for the task
	TimeElapsed int32  `json:"elapsed,optional"` // elapsed time in whole seconds
}
| ) | |||
| @@ -14,7 +14,7 @@ type ( | |||
| Description string `json:"description,optional"` | |||
| TenantId int64 `json:"tenantId,optional"` | |||
| TaskId int64 `json:"taskId,optional"` | |||
| AdapterIds []string `json:"adapterId"` | |||
| AdapterIds []string `json:"adapterIds"` | |||
| MatchLabels map[string]string `json:"matchLabels,optional"` | |||
| CardCount int64 `json:"cardCount,optional"` | |||
| WorkDir string `json:"workDir,optional"` //paratera:workingDir | |||
| @@ -19,7 +19,9 @@ type ( | |||
// ScheduleResult describes the outcome of submitting a task to one cluster.
ScheduleResult {
	ClusterId string `json:"clusterId"` // id of the cluster this result belongs to
	TaskId    string `json:"taskId"`    // id of the saved parent task, as a decimal string
	Card      string `json:"card"`      // compute card used for the submission
	Strategy  string `json:"strategy"`  // scheduling strategy name
	JobId     string `json:"jobId"`     // job id returned by the cluster
	Replica   int32  `json:"replica"`   // number of replicas assigned to the cluster
	Msg       string `json:"msg"`       // message returned for this submission
}
| @@ -32,6 +34,7 @@ type ( | |||
| AdapterId string `json:"adapterId"` | |||
| AiClusterIds []string `json:"aiClusterIds"` | |||
| ResourceType string `json:"resourceType"` | |||
| ComputeCard string `json:"card"` | |||
| Tops float64 `json:"Tops,optional"` | |||
| TaskType string `json:"taskType"` | |||
| Datasets string `json:"datasets"` | |||
| @@ -2,6 +2,8 @@ package ai | |||
| import ( | |||
| "context" | |||
| "errors" | |||
| "fmt" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | |||
| "strconv" | |||
| "sync" | |||
| @@ -46,6 +48,9 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList | |||
| if err != nil { | |||
| continue | |||
| } | |||
| if len(taskList) == 0 { | |||
| continue | |||
| } | |||
| for _, task := range taskList { | |||
| var elapsed time.Duration | |||
| switch task.Status { | |||
| @@ -68,6 +73,8 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList | |||
| t := &types.AiTask{ | |||
| Name: task.Name, | |||
| Status: task.Status, | |||
| Cluster: task.ClusterName, | |||
| Card: task.Card, | |||
| TimeElapsed: int32(elapsed.Seconds()), | |||
| } | |||
| resp.List = append(resp.List, t) | |||
| @@ -80,7 +87,6 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList | |||
| case <-time.After(2 * time.Second): | |||
| return resp, nil | |||
| } | |||
| } | |||
| func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) { | |||
| @@ -90,15 +96,20 @@ func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- | |||
| if err != nil { | |||
| continue | |||
| } | |||
| if len(taskList) == 0 { | |||
| continue | |||
| } | |||
| for _, task := range taskList { | |||
| t := task | |||
| if t.Status == constants.Completed || t.JobId == "" { | |||
| if t.Status == constants.Completed || task.Status == constants.Failed { | |||
| continue | |||
| } | |||
| wg.Add(1) | |||
| go func() { | |||
| trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId) | |||
| if err != nil { | |||
| msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error()) | |||
| logx.Errorf(errors.New(msg).Error()) | |||
| wg.Done() | |||
| return | |||
| } | |||
| @@ -2,12 +2,16 @@ package core | |||
| import ( | |||
| "context" | |||
| "errors" | |||
| "fmt" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils" | |||
| "strconv" | |||
| "sync" | |||
| "time" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| @@ -53,8 +57,9 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa | |||
| } | |||
| // 更新智算任务状态 | |||
| var ch = make(chan struct{}) | |||
| go l.updateAitaskStatus(list, ch) | |||
| chs := [2]chan struct{}{make(chan struct{}), make(chan struct{})} | |||
| go l.updateTaskStatus(list, chs[0]) | |||
| go l.updateAiTaskStatus(list, chs[1]) | |||
| for _, model := range list { | |||
| if model.StartTime != "" && model.EndTime == "" { | |||
| @@ -72,20 +77,22 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa | |||
| resp.PageNum = req.PageNum | |||
| resp.Total = total | |||
| select { | |||
| case _ = <-ch: | |||
| return resp, nil | |||
| case <-time.After(1 * time.Second): | |||
| return resp, nil | |||
| for _, ch := range chs { | |||
| select { | |||
| case <-ch: | |||
| case <-time.After(2 * time.Second): | |||
| return | |||
| } | |||
| } | |||
| return | |||
| } | |||
| func (l *PageListTaskLogic) updateAitaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) { | |||
| func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) { | |||
| for _, task := range tasks { | |||
| if task.AdapterTypeDict != 1 { | |||
| continue | |||
| } | |||
| if task.Status == constants.Succeeded { | |||
| if task.Status == constants.Succeeded || task.Status == constants.Failed { | |||
| continue | |||
| } | |||
| @@ -96,9 +103,15 @@ func (l *PageListTaskLogic) updateAitaskStatus(tasks []*types.TaskModel, ch chan | |||
| return | |||
| } | |||
| if len(aiTask) == 0 { | |||
| continue | |||
| } | |||
| start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local) | |||
| end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local) | |||
| var status = constants.Succeeded | |||
| var status string | |||
| var count int | |||
| for _, a := range aiTask { | |||
| s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local) | |||
| e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local) | |||
| @@ -116,20 +129,90 @@ func (l *PageListTaskLogic) updateAitaskStatus(tasks []*types.TaskModel, ch chan | |||
| break | |||
| } | |||
| if a.Status == constants.Pending { | |||
| status = a.Status | |||
| continue | |||
| } | |||
| if a.Status == constants.Running { | |||
| status = a.Status | |||
| continue | |||
| } | |||
| if a.Status == constants.Completed { | |||
| count++ | |||
| continue | |||
| } | |||
| } | |||
| if count == len(aiTask) { | |||
| status = constants.Succeeded | |||
| } | |||
| task.Status = status | |||
| task.StartTime = start.Format(constants.Layout) | |||
| task.EndTime = end.Format(constants.Layout) | |||
| if status != "" { | |||
| task.Status = status | |||
| task.StartTime = start.Format(constants.Layout) | |||
| task.EndTime = end.Format(constants.Layout) | |||
| } | |||
| tx = l.svcCtx.DbEngin.Table("task").Updates(task) | |||
| if tx.Error != nil { | |||
| logx.Errorf(tx.Error.Error()) | |||
| return | |||
| } | |||
| } | |||
| ch <- struct{}{} | |||
| } | |||
| func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) { | |||
| var wg sync.WaitGroup | |||
| for _, task := range tasks { | |||
| if task.AdapterTypeDict != 1 { | |||
| continue | |||
| } | |||
| if task.Status == constants.Succeeded || task.Status == constants.Failed { | |||
| continue | |||
| } | |||
| var aiTaskList []*models.TaskAi | |||
| tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTaskList) | |||
| if tx.Error != nil { | |||
| logx.Errorf(tx.Error.Error()) | |||
| return | |||
| } | |||
| if len(aiTaskList) == 0 { | |||
| continue | |||
| } | |||
| for _, aitask := range aiTaskList { | |||
| t := aitask | |||
| if t.Status == constants.Completed { | |||
| continue | |||
| } | |||
| wg.Add(1) | |||
| go func() { | |||
| trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId) | |||
| if err != nil { | |||
| msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error()) | |||
| logx.Errorf(errors.New(msg).Error()) | |||
| wg.Done() | |||
| return | |||
| } | |||
| t.Status = trainingTask.Status | |||
| t.StartTime = trainingTask.Start | |||
| t.EndTime = trainingTask.End | |||
| err = l.svcCtx.Scheduler.AiStorages.UpdateAiTask(t) | |||
| if err != nil { | |||
| msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error()) | |||
| logx.Errorf(errors.New(msg).Error()) | |||
| wg.Done() | |||
| return | |||
| } | |||
| wg.Done() | |||
| }() | |||
| } | |||
| } | |||
| wg.Wait() | |||
| ch <- struct{}{} | |||
| } | |||
| @@ -2,6 +2,7 @@ package hpc | |||
| import ( | |||
| "context" | |||
| "errors" | |||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | |||
| @@ -63,7 +64,9 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t | |||
| l.svcCtx.DbEngin.Raw("SELECT nickname FROM `t_cluster` where id = ?", clusterId).Scan(&clusterName) | |||
| l.svcCtx.DbEngin.Raw("SELECT adapter_id FROM `t_cluster` where id = ?", clusterId).Scan(&adapterId) | |||
| l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", adapterId).Scan(&adapterName) | |||
| if len(adapterName) == 0 || adapterName == "" { | |||
| return nil, errors.New("no corresponding adapter found") | |||
| } | |||
| env, _ := json.Marshal(req.Environment) | |||
| hpcInfo := models.TaskHpc{ | |||
| @@ -7,6 +7,8 @@ import ( | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | |||
| "strconv" | |||
| "strings" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| ) | |||
| @@ -32,6 +34,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type | |||
| TaskName: req.AiOption.TaskName, | |||
| ResourceType: req.AiOption.ResourceType, | |||
| Replica: req.AiOption.Replica, | |||
| ComputeCard: req.AiOption.ComputeCard, | |||
| Tops: req.AiOption.Tops, | |||
| TaskType: req.AiOption.TaskType, | |||
| DatasetsName: req.AiOption.Datasets, | |||
| @@ -69,14 +72,22 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type | |||
| for _, r := range rs { | |||
| scheResult := &types.ScheduleResult{} | |||
| scheResult.ClusterId = r.ClusterId | |||
| scheResult.TaskId = r.TaskId | |||
| scheResult.TaskId = strconv.FormatInt(id, 10) | |||
| scheResult.JobId = r.JobId | |||
| scheResult.Strategy = r.Strategy | |||
| scheResult.Card = strings.ToUpper(r.Card) | |||
| scheResult.Replica = r.Replica | |||
| scheResult.Msg = r.Msg | |||
| err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, r.TaskId, constants.Saved, r.Msg) | |||
| opt.ComputeCard = strings.ToUpper(r.Card) | |||
| clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId) | |||
| err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| resp.Results = append(resp.Results, scheResult) | |||
| } | |||
| @@ -35,6 +35,16 @@ func (s *AiStorage) GetClustersByAdapterId(id string) (*types.ClusterListResp, e | |||
| return &resp, nil | |||
| } | |||
| func (s *AiStorage) GetClusterNameById(id string) (string, error) { | |||
| var name string | |||
| tx := s.DbEngin.Raw("select `description` from t_cluster where `id` = ?", id).Scan(&name) | |||
| if tx.Error != nil { | |||
| logx.Errorf(tx.Error.Error()) | |||
| return "", tx.Error | |||
| } | |||
| return name, nil | |||
| } | |||
| func (s *AiStorage) GetAdapterIdsByType(adapterType string) ([]string, error) { | |||
| var list []types.AdapterInfo | |||
| var ids []string | |||
| @@ -63,10 +73,11 @@ func (s *AiStorage) GetAdaptersByType(adapterType string) ([]*types.AdapterInfo, | |||
| func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, error) { | |||
| var resp []*models.TaskAi | |||
| tx := s.DbEngin.Raw("select * from task_ai where `adapter_id` = ? ", adapterId).Scan(&resp) | |||
| if tx.Error != nil { | |||
| logx.Errorf(tx.Error.Error()) | |||
| return nil, tx.Error | |||
| db := s.DbEngin.Model(&models.TaskAi{}).Table("task_ai") | |||
| db = db.Where("adapter_id = ?", adapterId) | |||
| err := db.Order("commit_time desc").Find(&resp).Error | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return resp, nil | |||
| } | |||
| @@ -90,7 +101,7 @@ func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int6 | |||
| return taskModel.Id, nil | |||
| } | |||
| func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId string, jobId string, status string, msg string) error { | |||
| func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId string, clusterName string, jobId string, status string, msg string) error { | |||
| // 构建主任务结构体 | |||
| aId, err := strconv.ParseInt(option.AdapterId, 10, 64) | |||
| if err != nil { | |||
| @@ -100,18 +111,24 @@ func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId | |||
| if err != nil { | |||
| return err | |||
| } | |||
| del, _ := time.Parse(constants.Layout, constants.Layout) | |||
| aiTaskModel := models.TaskAi{ | |||
| TaskId: taskId, | |||
| AdapterId: aId, | |||
| ClusterId: cId, | |||
| Name: option.TaskName, | |||
| TaskId: taskId, | |||
| AdapterId: aId, | |||
| ClusterId: cId, | |||
| ClusterName: clusterName, | |||
| Name: option.TaskName, | |||
| Replica: int64(option.Replica), | |||
| JobId: jobId, | |||
| TaskType: option.TaskType, | |||
| Strategy: option.StrategyName, | |||
| Status: status, | |||
| Msg: msg, | |||
| CommitTime: time.Now(), | |||
| JobId: jobId, | |||
| TaskType: option.TaskType, | |||
| Strategy: option.StrategyName, | |||
| Status: status, | |||
| Msg: msg, | |||
| Card: option.ComputeCard, | |||
| DeletedAt: del, | |||
| CommitTime: time.Now(), | |||
| } | |||
| // 保存任务数据到数据库 | |||
| tx := s.DbEngin.Create(&aiTaskModel) | |||
| @@ -19,6 +19,7 @@ import ( | |||
| "encoding/json" | |||
| "errors" | |||
| "fmt" | |||
| "github.com/zeromicro/go-zero/core/logx" | |||
| "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler" | |||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" | |||
| @@ -42,10 +43,11 @@ type AiScheduler struct { | |||
| } | |||
// AiResult is the outcome of submitting an AI task to a single cluster.
type AiResult struct {
	JobId     string // job id returned by the cluster; left empty when the submission failed
	ClusterId string // id of the cluster the task was assigned to
	Strategy  string // name of the scheduling strategy that was used
	Replica   int32  // number of replicas assigned to the cluster
	Card      string // compute card the task was submitted with
	Msg       string // error message returned by the cluster; empty on success
}
| @@ -156,6 +158,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa | |||
| result.Replica = c.Replicas | |||
| result.ClusterId = c.ClusterId | |||
| result.Strategy = as.option.StrategyName | |||
| result.Card = opt.ComputeCard | |||
| ch <- result | |||
| wg.Done() | |||
| @@ -192,28 +195,35 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa | |||
| }) | |||
| msg := fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error()) | |||
| errmsg += msg | |||
| err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, "", constants.Failed, msg) | |||
| clusterName, _ := as.AiStorages.GetClusterNameById(e.clusterId) | |||
| err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, clusterName, "", constants.Failed, msg) | |||
| if err != nil { | |||
| return nil, errors.New("database add failed: " + err.Error()) | |||
| } | |||
| } | |||
| for _, s := range results { | |||
| as.option.ComputeCard = s.Card //execute card | |||
| clusterName, _ := as.AiStorages.GetClusterNameById(s.ClusterId) | |||
| if s.Msg != "" { | |||
| msg := fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg) | |||
| errmsg += msg | |||
| err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, "", constants.Failed, msg) | |||
| err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, clusterName, "", constants.Failed, msg) | |||
| if err != nil { | |||
| return nil, errors.New("database add failed: " + err.Error()) | |||
| } | |||
| } else { | |||
| msg := fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId) | |||
| msg := fmt.Sprintf("clusterId: %v , submitted successfully, jobId: %v \n", s.ClusterId, s.JobId) | |||
| errmsg += msg | |||
| err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, s.TaskId, constants.Succeeded, msg) | |||
| err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, clusterName, s.JobId, constants.Saved, msg) | |||
| if err != nil { | |||
| return nil, errors.New("database add failed: " + err.Error()) | |||
| } | |||
| } | |||
| } | |||
| logx.Errorf(errors.New(errmsg).Error()) | |||
| return nil, errors.New(errmsg) | |||
| } | |||
| @@ -288,7 +298,7 @@ func convertType(in interface{}) (*AiResult, error) { | |||
| case *hpcAC.SubmitTaskAiResp: | |||
| resp := (in).(*hpcAC.SubmitTaskAiResp) | |||
| if resp.Code == "0" { | |||
| result.TaskId = resp.Data | |||
| result.JobId = resp.Data | |||
| } else { | |||
| result.Msg = resp.Msg | |||
| } | |||
| @@ -297,7 +307,7 @@ func convertType(in interface{}) (*AiResult, error) { | |||
| resp := (in).(*octopus.CreateTrainJobResp) | |||
| if resp.Success { | |||
| result.TaskId = resp.Payload.JobId | |||
| result.JobId = resp.Payload.JobId | |||
| } else { | |||
| result.Msg = resp.Error.Message | |||
| } | |||
| @@ -18,8 +18,9 @@ type Strategy interface { | |||
| } | |||
// AssignedCluster is one cluster selected by a strategy together with the
// number of replicas assigned to it.
type AssignedCluster struct {
	ClusterId   string // id of the selected cluster
	ClusterName string // name of the selected cluster
	Replicas    int32  // number of replicas assigned to the cluster
}
| func GetStrategyNames() []string { | |||
| @@ -402,7 +402,7 @@ func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType st | |||
| aLatest := &octopus.Algorithms{} | |||
| for i, _ := range algorithms { | |||
| if time.Unix(aLatest.CreatedAt, 0).After(time.Unix(algorithms[i].CreatedAt, 0)) { | |||
| if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) { | |||
| aLatest = algorithms[i] | |||
| } | |||
| } | |||
| @@ -493,7 +493,11 @@ func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*coll | |||
| } | |||
| jobresp, ok := (resp).(*octopus.GetTrainJobResp) | |||
| if !jobresp.Success || !ok { | |||
| return nil, errors.New("get training task failed") | |||
| if jobresp.Error != nil { | |||
| return nil, errors.New(jobresp.Error.Message) | |||
| } else { | |||
| return nil, errors.New("get training task failed, empty error returned") | |||
| } | |||
| } | |||
| var task collector.Task | |||
| task.Id = jobresp.Payload.TrainJob.Id | |||
| @@ -508,6 +512,8 @@ func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*coll | |||
| task.Status = constants.Running | |||
| case "stopped": | |||
| task.Status = constants.Stopped | |||
| case "pending": | |||
| task.Status = constants.Pending | |||
| default: | |||
| task.Status = "undefined" | |||
| } | |||
| @@ -662,9 +668,22 @@ func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOpti | |||
| if option.ResourceType == CARD { | |||
| for _, image := range preImgResp.Payload.Images { | |||
| if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) { | |||
| option.ImageId = image.Id | |||
| return nil | |||
| if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) { | |||
| switch strings.ToUpper(option.ComputeCard) { | |||
| case GCU: | |||
| if strings.HasPrefix(image.ImageVersion, "t20_") { | |||
| option.ImageId = image.Id | |||
| return nil | |||
| } | |||
| case BIV100: | |||
| if strings.HasPrefix(image.ImageVersion, "bi_") { | |||
| option.ImageId = image.Id | |||
| return nil | |||
| } | |||
| case MLU: | |||
| option.ImageId = image.Id | |||
| return nil | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -750,7 +769,7 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec | |||
| if spec.Price == 1 { | |||
| ns := strings.Split(spec.Name, COMMA) | |||
| cardSpecs := strings.Split(ns[0], STAR) | |||
| if cardSpecs[1] == cardCnMap[computeCard] { | |||
| if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] { | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| @@ -766,7 +785,7 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec | |||
| if spec.Price == 1 { | |||
| ns := strings.Split(spec.Name, COMMA) | |||
| cardSpecs := strings.Split(ns[0], STAR) | |||
| if cardSpecs[1] == cardCnMap[computeCard] { | |||
| if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] { | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| @@ -780,7 +799,7 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec | |||
| continue | |||
| } | |||
| cardSpecs := strings.Split(ns[0], STAR) | |||
| if cardSpecs[1] != cardCnMap[computeCard] { | |||
| if cardSpecs[1] != cardCnMap[strings.ToUpper(computeCard)] { | |||
| continue | |||
| } | |||
| s, err := strconv.ParseFloat(cardSpecs[0], 64) | |||
| @@ -789,36 +808,32 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec | |||
| } | |||
| switch computeCard { | |||
| case GCU: | |||
| option.ComputeCard = computeCard | |||
| if cardNum == s { // 1, 4, 8 | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| } | |||
| if 1 < cardNum && cardNum <= 4 && s == 4 { | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| } | |||
| if 4 < cardNum && s == 8 { | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| } | |||
| case MLU: // 1, 2, 4 | |||
| option.ComputeCard = computeCard | |||
| if cardNum/2 == s { | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| } | |||
| if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 { | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| } | |||
| if 2 < cardNum/2 && s == 4 { | |||
| option.ResourceId = spec.Id | |||
| option.ComputeCard = computeCard | |||
| return nil | |||
| } | |||
| } | |||
| @@ -565,10 +565,14 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error { | |||
| if option.ResourceType == CPU { | |||
| option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" | |||
| option.ComputeCard = CPU | |||
| return nil | |||
| } | |||
| if option.ResourceType == CARD { | |||
| option.ComputeCard = DCU | |||
| if 0 <= option.Tops && option.Tops <= DCU_TOPS { | |||
| option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" | |||
| return nil | |||
| @@ -82,8 +82,8 @@ var ( | |||
| "3": SHUGUANGAI, | |||
| "4": SHUGUANGHPC, | |||
| } | |||
| resourceTypes = []string{CPU, CARD} | |||
| taskTypes = []string{PYTORCH_TASK, TENSORFLOW_TASK} | |||
| resourceTypes = []string{CARD} | |||
| taskTypes = []string{PYTORCH_TASK} | |||
| ERROR_RESP_EMPTY = errors.New("resp empty error") | |||
| ERROR_CONVERT_EMPTY = errors.New("convert empty error") | |||
| @@ -91,7 +91,7 @@ func NewServiceContext(c config.Config) *ServiceContext { | |||
| NamingStrategy: schema.NamingStrategy{ | |||
| SingularTable: true, // 使用单数表名,启用该选项,此时,`User` 的表名应该是 `t_user` | |||
| }, | |||
| Logger: logger.Default.LogMode(logger.Info), | |||
| Logger: logger.Default.LogMode(logger.Error), | |||
| }) | |||
| if err != nil { | |||
| logx.Errorf("数据库连接失败, err%v", err) | |||
| @@ -1164,7 +1164,7 @@ type CommitHpcTaskReq struct { | |||
| Description string `json:"description,optional"` | |||
| TenantId int64 `json:"tenantId,optional"` | |||
| TaskId int64 `json:"taskId,optional"` | |||
| AdapterIds []string `json:"adapterId"` | |||
| AdapterIds []string `json:"adapterIds"` | |||
| MatchLabels map[string]string `json:"matchLabels,optional"` | |||
| CardCount int64 `json:"cardCount,optional"` | |||
| WorkDir string `json:"workDir,optional"` //paratera:workingDir | |||
| @@ -2841,6 +2841,8 @@ type CenterTaskListResp struct { | |||
// AiTask is one AI task entry returned in the center task list.
type AiTask struct {
	Name        string `json:"name,optional"`    // task name
	Status      string `json:"status,optional"`  // task status
	Cluster     string `json:"cluster,optional"` // name of the cluster the task was stored with
	Card        string `json:"card,optional"`    // compute card recorded for the task
	TimeElapsed int32  `json:"elapsed,optional"` // elapsed time in whole seconds
}
| @@ -5620,7 +5622,9 @@ type ScheduleResp struct { | |||
// ScheduleResult describes the outcome of submitting a task to one cluster.
type ScheduleResult struct {
	ClusterId string `json:"clusterId"` // id of the cluster this result belongs to
	TaskId    string `json:"taskId"`    // id of the saved parent task, as a decimal string
	Card      string `json:"card"`      // compute card used for the submission
	Strategy  string `json:"strategy"`  // scheduling strategy name
	JobId     string `json:"jobId"`     // job id returned by the cluster
	Replica   int32  `json:"replica"`   // number of replicas assigned to the cluster
	Msg       string `json:"msg"`       // message returned for this submission
}
| @@ -5633,6 +5637,7 @@ type AiOption struct { | |||
| AdapterId string `json:"adapterId"` | |||
| AiClusterIds []string `json:"aiClusterIds"` | |||
| ResourceType string `json:"resourceType"` | |||
| ComputeCard string `json:"card"` | |||
| Tops float64 `json:"Tops,optional"` | |||
| TaskType string `json:"taskType"` | |||
| Datasets string `json:"datasets"` | |||
| @@ -36,20 +36,24 @@ type ( | |||
| } | |||
| TaskAi struct { | |||
| Id int64 `db:"id"` // id | |||
| TaskId int64 `db:"task_id"` // 任务id | |||
| AdapterId int64 `db:"adapter_id"` // 设配器id | |||
| ClusterId int64 `db:"cluster_id"` // 集群id | |||
| Name string `db:"name"` // 任务名 | |||
| Replica int64 `db:"replica"` // 执行数 | |||
| JobId string `db:"job_id"` // 集群返回任务id | |||
| Strategy string `db:"strategy"` // 主任务使用策略 | |||
| Status string `db:"status"` // 任务状态 | |||
| Msg string `db:"msg"` // 集群返回任务信息 | |||
| CommitTime time.Time `db:"commit_time"` // 提交时间 | |||
| StartTime string `db:"start_time"` // 开始时间 | |||
| EndTime string `db:"end_time"` // 结束时间 | |||
| TaskType string `db:"task_type"` | |||
| Id int64 `db:"id"` // id | |||
| TaskId int64 `db:"task_id"` // 任务id | |||
| AdapterId int64 `db:"adapter_id"` // 适配器id | |||
| AdapterName string `db:"adapter_name"` // 适配器名称 | |||
| ClusterId int64 `db:"cluster_id"` // 集群id | |||
| ClusterName string `db:"cluster_name"` // 集群名称 | |||
| Name string `db:"name"` // 任务名 | |||
| Replica int64 `db:"replica"` // 执行数 | |||
| JobId string `db:"job_id"` // 集群返回任务id | |||
| Strategy string `db:"strategy"` // 主任务使用策略 | |||
| Status string `db:"status"` // 任务状态 | |||
| Msg string `db:"msg"` // 集群返回任务信息 | |||
| CommitTime time.Time `db:"commit_time"` // 提交时间 | |||
| StartTime string `db:"start_time"` // 开始时间 | |||
| EndTime string `db:"end_time"` // 结束时间 | |||
| TaskType string `db:"task_type"` | |||
| DeletedAt time.Time `db:"deleted_at"` | |||
| Card string `db:"card"` | |||
| } | |||
| ) | |||
| @@ -88,14 +92,14 @@ func (m *defaultTaskAiModel) FindOne(ctx context.Context, id int64) (*TaskAi, er | |||
| } | |||
| func (m *defaultTaskAiModel) Insert(ctx context.Context, data *TaskAi) (sql.Result, error) { | |||
| query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet) | |||
| ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType) | |||
| query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet) | |||
| ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card) | |||
| return ret, err | |||
| } | |||
| func (m *defaultTaskAiModel) Update(ctx context.Context, data *TaskAi) error { | |||
| query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskAiRowsWithPlaceHolder) | |||
| _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.Id) | |||
| _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card, data.Id) | |||
| return err | |||
| } | |||