| @@ -6,6 +6,7 @@ type Options struct { | |||||
| } | } | ||||
| type Client interface { | type Client interface { | ||||
| Task(TaskOptions) (Task, error) | Task(TaskOptions) (Task, error) | ||||
| Notice(NoticeOptions) (Notice, error) | |||||
| } | } | ||||
| func NewClient(options Options) (Client, error) { | func NewClient(options Options) (Client, error) { | ||||
| @@ -19,6 +19,11 @@ func (c *client) Task(options TaskOptions) (Task, error) { | |||||
| return task, nil | return task, nil | ||||
| } | } | ||||
| func (c *client) Notice(options NoticeOptions) (Notice, error) { | |||||
| notice, _ := newNotice(c, &options) | |||||
| return notice, nil | |||||
| } | |||||
| func newClient(options Options) (Client, error) { | func newClient(options Options) (Client, error) { | ||||
| //init dbEngine | //init dbEngine | ||||
| dbEngin, _ := gorm.Open(mysql.Open(options.DataSource), &gorm.Config{ | dbEngin, _ := gorm.Open(mysql.Open(options.DataSource), &gorm.Config{ | ||||
| @@ -0,0 +1,9 @@ | |||||
| package client | |||||
| type NoticeOptions struct { | |||||
| pushNoticeReq PushNoticeReq | |||||
| } | |||||
| type Notice interface { | |||||
| PushNotice(pushNoticeReq PushNoticeReq) (*PushNoticeResp, error) | |||||
| } | |||||
| @@ -0,0 +1,46 @@ | |||||
| package client | |||||
| import ( | |||||
| "io/ioutil" | |||||
| "k8s.io/apimachinery/pkg/util/json" | |||||
| "log" | |||||
| "net/http" | |||||
| "strings" | |||||
| "sync" | |||||
| ) | |||||
| type notice struct { | |||||
| sync.RWMutex | |||||
| client *client | |||||
| options *NoticeOptions | |||||
| log log.Logger | |||||
| } | |||||
| func newNotice(client *client, options *NoticeOptions) (*notice, error) { | |||||
| notice := ¬ice{ | |||||
| RWMutex: sync.RWMutex{}, | |||||
| client: client, | |||||
| options: options, | |||||
| log: log.Logger{}, | |||||
| } | |||||
| return notice, nil | |||||
| } | |||||
| func (n *notice) PushNotice(pushNoticeReq PushNoticeReq) (*PushNoticeResp, error) { | |||||
| url := n.client.url + "/pcm/v1/core/pushNotice" | |||||
| method := "GET" | |||||
| jsonStr, _ := json.Marshal(pushNoticeReq) | |||||
| payload := strings.NewReader(string(jsonStr)) | |||||
| client := &http.Client{} | |||||
| req, _ := http.NewRequest(method, url, payload) | |||||
| req.Header.Add("Content-Type", "application/json") | |||||
| res, _ := client.Do(req) | |||||
| defer res.Body.Close() | |||||
| body, _ := ioutil.ReadAll(res.Body) | |||||
| var resp PushNoticeResp | |||||
| json.Unmarshal(body, &resp) | |||||
| return &resp, nil | |||||
| } | |||||
| @@ -9,5 +9,5 @@ type TaskOptions struct { | |||||
| type Task interface { | type Task interface { | ||||
| PullTaskInfo(pullTaskInfoReq PullTaskInfoReq) (*PullTaskInfoResp, error) | PullTaskInfo(pullTaskInfoReq PullTaskInfoReq) (*PullTaskInfoResp, error) | ||||
| PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp, error) | PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp, error) | ||||
| PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) error | |||||
| PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) (*PushResourceInfoResp, error) | |||||
| } | } | ||||
| @@ -50,8 +50,8 @@ func (t *task) PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp, | |||||
| url := t.client.url + "/pcm/v1/core/pushTaskInfo" | url := t.client.url + "/pcm/v1/core/pushTaskInfo" | ||||
| method := "POST" | method := "POST" | ||||
| infoReq := PullTaskInfoReq{AdapterId: pushTaskInfoReq.AdapterId} | |||||
| jsonStr, _ := json.Marshal(infoReq) | |||||
| //infoReq := PullTaskInfoReq{AdapterId: pushTaskInfoReq.AdapterId} | |||||
| jsonStr, _ := json.Marshal(pushTaskInfoReq) | |||||
| payload := strings.NewReader(string(jsonStr)) | payload := strings.NewReader(string(jsonStr)) | ||||
| client := &http.Client{} | client := &http.Client{} | ||||
| @@ -66,7 +66,22 @@ func (t *task) PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp, | |||||
| return &resp, nil | return &resp, nil | ||||
| } | } | ||||
| func (t *task) PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) error { | |||||
| //TODO implement me | |||||
| panic("implement me") | |||||
| func (t *task) PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) (*PushResourceInfoResp, error) { | |||||
| url := t.client.url + "/pcm/v1/core/pushResourceInfo" | |||||
| method := "POST" | |||||
| //infoReq := PushResourceInfoReq{AdapterId: pushResourceInfoReq.AdapterId} | |||||
| jsonStr, _ := json.Marshal(pushResourceInfoReq) | |||||
| payload := strings.NewReader(string(jsonStr)) | |||||
| client := &http.Client{} | |||||
| req, _ := http.NewRequest(method, url, payload) | |||||
| req.Header.Add("Content-Type", "application/json") | |||||
| res, _ := client.Do(req) | |||||
| defer res.Body.Close() | |||||
| body, _ := ioutil.ReadAll(res.Body) | |||||
| var resp PushResourceInfoResp | |||||
| json.Unmarshal(body, &resp) | |||||
| return &resp, nil | |||||
| } | } | ||||
| @@ -25,12 +25,46 @@ type PushTaskInfoReq struct { | |||||
| } | } | ||||
| type PushTaskInfoResp struct { | type PushTaskInfoResp struct { | ||||
| Code int64 | |||||
| Msg string | |||||
| Code int64 `json:"code"` | |||||
| Msg string `json:"msg"` | |||||
| } | } | ||||
| type PushResourceInfoReq struct { | type PushResourceInfoReq struct { | ||||
| AdapterId int64 `json:"adapterId"` | |||||
| AdapterId int64 `json:"adapterId"` | |||||
| ResourceStats []ResourceStats `json:"resourceStats"` | |||||
| } | |||||
| type PushResourceInfoResp struct { | |||||
| Code int64 `json:"code"` | |||||
| Msg string `json:"msg"` | |||||
| } | |||||
| type NoticeInfo struct { | |||||
| AdapterId int64 `json:"adapterId"` | |||||
| AdapterName string `json:"adapterName"` | |||||
| ClusterId int64 `json:"clusterId"` | |||||
| ClusterName string `json:"clusterName"` | |||||
| NoticeType string `json:"noticeType"` | |||||
| TaskName string `json:"taskName"` | |||||
| Incident string `json:"incident"` | |||||
| CreatedTime time.Time `json:"createdTime"` | |||||
| } | |||||
| type ListNoticeReq struct { | |||||
| } | |||||
| type ListNoticeResp struct { | |||||
| Code int64 `json:"code"` | |||||
| Msg string `json:"msg"` | |||||
| Data []NoticeInfo `json:"data"` | |||||
| } | |||||
| type PushNoticeReq struct { | |||||
| NoticeInfo NoticeInfo `json:"noticeInfo"` | |||||
| } | |||||
| type PushNoticeResp struct { | |||||
| Code int64 `json:"code"` | |||||
| Msg string `json:"msg"` | |||||
| } | } | ||||
| type HpcInfo struct { | type HpcInfo struct { | ||||
| @@ -119,5 +153,30 @@ type VmInfo struct { | |||||
| BlockUuid string `json:"block_uuid,omitempty"` | BlockUuid string `json:"block_uuid,omitempty"` | ||||
| SourceType string `json:"source_type,omitempty"` | SourceType string `json:"source_type,omitempty"` | ||||
| DeleteOnTermination bool `json:"delete_on_termination,omitempty"` | DeleteOnTermination bool `json:"delete_on_termination,omitempty"` | ||||
| State string `json:"state,omitempty"` | |||||
| Status string `json:"Status,omitempty"` | |||||
| StartTime string `json:"startTime,omitempty"` | |||||
| } | |||||
| type ResourceStats struct { | |||||
| ClusterId int64 `json:"clusterId"` | |||||
| Name string `json:"name"` | |||||
| CpuCoreAvail int64 `json:"cpuCoreAvail"` | |||||
| CpuCoreTotal int64 `json:"cpuCoreTotal"` | |||||
| MemAvail float64 `json:"memAvail"` | |||||
| MemTotal float64 `json:"memTotal"` | |||||
| DiskAvail float64 `json:"diskAvail"` | |||||
| DiskTotal float64 `json:"diskTotal"` | |||||
| GpuAvail int64 `json:"gpuAvail"` | |||||
| CardsAvail []*Card `json:"cardsAvail"` | |||||
| CpuCoreHours float64 `json:"cpuCoreHours"` | |||||
| Balance float64 `json:"balance"` | |||||
| } | |||||
| type Card struct { | |||||
| Platform string `json:"platform"` | |||||
| Type string `json:"type"` | |||||
| Name string `json:"name"` | |||||
| TOpsAtFp16 float64 `json:"TOpsAtFp16"` | |||||
| CardHours float64 `json:"cardHours"` | |||||
| CardNum int32 `json:"cardNum"` | |||||
| } | } | ||||
| @@ -1,126 +0,0 @@ | |||||
| syntax = "v1" | |||||
| info( | |||||
| title: "type title here" | |||||
| desc: "type desc here" | |||||
| author: "type author here" | |||||
| email: "type email here" | |||||
| version: "type version here" | |||||
| ) | |||||
| type PullTaskInfoReq { | |||||
| AdapterId int64 `form:"adapterId"` | |||||
| } | |||||
| type PullTaskInfoResp struct { | |||||
| HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` | |||||
| CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` | |||||
| AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` | |||||
| VmInfoList []*VmInfo `json:"VmInfoList,omitempty"` | |||||
| } | |||||
| type HpcInfo struct { | |||||
| Id int64 `json:"id"` // id | |||||
| TaskId int64 `json:"task_id"` // 任务id | |||||
| JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) | |||||
| AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id | |||||
| ClusterId int64 `json:"cluster_id"` // 执行任务的集群id | |||||
| ClusterType string `json:"cluster_type"` // 执行任务的集群类型 | |||||
| Name string `json:"name"` // 名称 | |||||
| Status string `json:"status"` // 状态 | |||||
| CmdScript string `json:"cmd_script"` | |||||
| StartTime string `json:"start_time"` // 开始时间 | |||||
| RunningTime int64 `json:"running_time"` // 运行时间 | |||||
| DerivedEs string `json:"derived_es"` | |||||
| Cluster string `json:"cluster"` | |||||
| BlockId int64 `json:"block_id"` | |||||
| AllocNodes int64 `json:"alloc_nodes"` | |||||
| AllocCpu int64 `json:"alloc_cpu"` | |||||
| CardCount int64 `json:"card_count"` // 卡数 | |||||
| Version string `json:"version"` | |||||
| Account string `json:"account"` | |||||
| WorkDir string `json:"work_dir"` // 工作路径 | |||||
| AssocId int64 `json:"assoc_id"` | |||||
| ExitCode int64 `json:"exit_code"` | |||||
| WallTime string `json:"wall_time"` // 最大运行时间 | |||||
| Result string `json:"result"` // 运行结果 | |||||
| DeletedAt string `json:"deleted_at"` // 删除时间 | |||||
| YamlString string `json:"yaml_string"` | |||||
| AppType string `json:"app_type"` // 应用类型 | |||||
| AppName string `json:"app_name"` // 应用名称 | |||||
| Queue string `json:"queue"` // 队列名称 | |||||
| SubmitType string `json:"submit_type"` // cmd(命令行模式) | |||||
| NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") | |||||
| StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j | |||||
| StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j | |||||
| StdInput string `json:"std_input"` | |||||
| Environment string `json:"environment"` | |||||
| DeletedFlag int64 `json:"deleted_flag"` // 是否删除(0-否,1-是) | |||||
| CreatedBy int64 `json:"created_by"` // 创建人 | |||||
| CreatedTime string `json:"created_time"` // 创建时间 | |||||
| UpdatedBy int64 `json:"updated_by"` // 更新人 | |||||
| UpdatedTime string `json:"updated_time"` // 更新时间 | |||||
| } | |||||
| type CloudInfo struct { | |||||
| Participant int64 `json:"participant,omitempty"` | |||||
| Id int64 `json:"id,omitempty"` | |||||
| TaskId int64 `json:"taskId,omitempty"` | |||||
| ApiVersion string `json:"apiVersion,omitempty"` | |||||
| Kind string `json:"kind,omitempty"` | |||||
| Namespace string `json:"namespace,omitempty"` | |||||
| Name string `json:"name,omitempty"` | |||||
| Status string `json:"status,omitempty"` | |||||
| StartTime string `json:"startTime,omitempty"` | |||||
| RunningTime int64 `json:"runningTime,omitempty"` | |||||
| Result string `json:"result,omitempty"` | |||||
| YamlString string `json:"yamlString,omitempty"` | |||||
| } | |||||
| type AiInfo struct { | |||||
| ParticipantId int64 `json:"participantId,omitempty"` | |||||
| TaskId int64 `json:"taskId,omitempty"` | |||||
| ProjectId string `json:"project_id,omitempty"` | |||||
| Name string `json:"name,omitempty"` | |||||
| Status string `json:"status,omitempty"` | |||||
| StartTime string `json:"startTime,omitempty"` | |||||
| RunningTime int64 `json:"runningTime,omitempty"` | |||||
| Result string `json:"result,omitempty"` | |||||
| JobId string `json:"jobId,omitempty"` | |||||
| CreateTime string `json:"createTime,omitempty"` | |||||
| ImageUrl string `json:"imageUrl,omitempty"` | |||||
| Command string `json:"command,omitempty"` | |||||
| FlavorId string `json:"flavorId,omitempty"` | |||||
| SubscriptionId string `json:"subscriptionId,omitempty"` | |||||
| ItemVersionId string `json:"itemVersionId,omitempty"` | |||||
| } | |||||
| type VmInfo struct { | |||||
| ParticipantId int64 `json:"participantId,omitempty"` | |||||
| TaskId int64 `json:"taskId,omitempty"` | |||||
| Name string `json:"name,omitempty"` | |||||
| FlavorRef string `json:"flavor_ref,omitempty"` | |||||
| ImageRef string `json:"image_ref,omitempty"` | |||||
| NetworkUuid string `json:"network_uuid,omitempty"` | |||||
| BlockUuid string `json:"block_uuid,omitempty"` | |||||
| SourceType string `json:"source_type,omitempty"` | |||||
| DeleteOnTermination bool `json:"delete_on_termination,omitempty"` | |||||
| State string `json:"state,omitempty"` | |||||
| } | |||||
| type PushTaskInfoReq struct { | |||||
| AdapterId int64 `json:"adapterId"` | |||||
| HpcInfoList []*HpcInfo `json:"hpcInfoList"` | |||||
| CloudInfoList []*CloudInfo `json:"cloudInfoList"` | |||||
| AiInfoList []*AiInfo `json:"aiInfoList"` | |||||
| VmInfoList []*VmInfo `json:"vmInfoList"` | |||||
| } | |||||
| type PushTaskInfoResp struct { | |||||
| Code int64 `json:"code"` | |||||
| Msg string `json:"msg"` | |||||
| } | |||||
| type PushResourceInfoReq struct { | |||||
| AdapterId int64 `json:"adapterId"` | |||||
| } | |||||
| @@ -9,7 +9,6 @@ import ( | |||||
| "cloud/pcm-cloud.api" | "cloud/pcm-cloud.api" | ||||
| "storelink/pcm-storelink.api" | "storelink/pcm-storelink.api" | ||||
| "schedule/pcm-schedule.api" | "schedule/pcm-schedule.api" | ||||
| "participant/pcm-participant.api" | |||||
| "monitoring/pcm-monitoring.api" | "monitoring/pcm-monitoring.api" | ||||
| ) | ) | ||||
| @@ -111,14 +110,26 @@ service pcm { | |||||
| @handler metricsHandler | @handler metricsHandler | ||||
| get /core/metrics | get /core/metrics | ||||
| @doc "provided to participant to pull task info from core" | |||||
| @doc "provide for adapter to pull task info from core" | |||||
| @handler pullTaskInfoHandler | @handler pullTaskInfoHandler | ||||
| get /core/pullTaskInfo (PullTaskInfoReq) returns (PullTaskInfoResp) | get /core/pullTaskInfo (PullTaskInfoReq) returns (PullTaskInfoResp) | ||||
| @doc "provided to participant to push task info to core" | |||||
| @doc "provide for adapter to push task info to core" | |||||
| @handler pushTaskInfoHandler | @handler pushTaskInfoHandler | ||||
| post /core/pushTaskInfo (PushTaskInfoReq) returns (PushTaskInfoResp) | post /core/pushTaskInfo (PushTaskInfoReq) returns (PushTaskInfoResp) | ||||
| @doc "provide for adapter to push resource info to core" | |||||
| @handler pushResourceInfoHandler | |||||
| post /core/pushResourceInfo (PushResourceInfoReq) returns (PushResourceInfoResp) | |||||
| @doc "provide for adapter to push notice info to core" | |||||
| @handler pushNoticeHandler | |||||
| post /core/pushNotice (PushNoticeReq) returns (PushNoticeResp) | |||||
| @doc "list notice" | |||||
| @handler listNoticeHandler | |||||
| get /core/listNotice (ListNoticeReq) returns (ListNoticeResp) | |||||
| @doc "paging queries the task list" | @doc "paging queries the task list" | ||||
| @handler pageListTaskHandler | @handler pageListTaskHandler | ||||
| get /core/task/list (pageTaskReq) returns(PageResult) | get /core/task/list (pageTaskReq) returns(PageResult) | ||||
| @@ -146,6 +157,10 @@ service pcm { | |||||
| @handler jobHandler | @handler jobHandler | ||||
| get /hpc/job (hpcJobReq) returns (hpcJobResp) | get /hpc/job (hpcJobReq) returns (hpcJobResp) | ||||
| @doc "超算资源总览" | |||||
| @handler resourceHandler | |||||
| get /hpc/resource (hpcResourceReq) returns (hpcResourceResp) | |||||
| @doc "超算查询资产列表" | @doc "超算查询资产列表" | ||||
| @handler queueAssetsHandler | @handler queueAssetsHandler | ||||
| get /hpc/queueAssets returns (QueueAssetsResp) | get /hpc/queueAssets returns (QueueAssetsResp) | ||||
| @@ -895,13 +910,13 @@ service pcm { | |||||
| get /schedule/ai/getTaskTypes returns (AiTaskTypesResp) | get /schedule/ai/getTaskTypes returns (AiTaskTypesResp) | ||||
| @handler ScheduleGetDatasetsHandler | @handler ScheduleGetDatasetsHandler | ||||
| get /schedule/ai/getDatasets returns (AiDatasetsResp) | |||||
| get /schedule/ai/getDatasets/:adapterId (AiDatasetsReq) returns (AiDatasetsResp) | |||||
| @handler ScheduleGetStrategyHandler | @handler ScheduleGetStrategyHandler | ||||
| get /schedule/ai/getStrategies returns (AiStrategyResp) | get /schedule/ai/getStrategies returns (AiStrategyResp) | ||||
| @handler ScheduleGetAlgorithmsHandler | @handler ScheduleGetAlgorithmsHandler | ||||
| get /schedule/ai/getAlgorithms/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp) | |||||
| get /schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp) | |||||
| @handler ScheduleSubmitHandler | @handler ScheduleSubmitHandler | ||||
| post /schedule/submit (ScheduleReq) returns (ScheduleResp) | post /schedule/submit (ScheduleReq) returns (ScheduleResp) | ||||
| @@ -19,13 +19,15 @@ type ( | |||||
| ScheduleResult { | ScheduleResult { | ||||
| ClusterId string `json:"clusterId"` | ClusterId string `json:"clusterId"` | ||||
| TaskId string `json:"taskId"` | TaskId string `json:"taskId"` | ||||
| Strategy string `json:"strategy"` | |||||
| Replica int32 `json:"replica"` | Replica int32 `json:"replica"` | ||||
| Msg string `json:"msg"` | Msg string `json:"msg"` | ||||
| } | } | ||||
| AiOption { | AiOption { | ||||
| TaskName string `json:"taskName"` | TaskName string `json:"taskName"` | ||||
| AiClusterId string `json:"aiClusterId,optional"` | |||||
| AdapterId string `json:"adapterId"` | |||||
| AiClusterIds []string `json:"aiClusterIds"` | |||||
| ResourceType string `json:"resourceType"` | ResourceType string `json:"resourceType"` | ||||
| Tops float64 `json:"Tops,optional"` | Tops float64 `json:"Tops,optional"` | ||||
| TaskType string `json:"taskType"` | TaskType string `json:"taskType"` | ||||
| @@ -46,6 +48,10 @@ type ( | |||||
| TaskTypes []string `json:"taskTypes"` | TaskTypes []string `json:"taskTypes"` | ||||
| } | } | ||||
| AiDatasetsReq { | |||||
| AdapterId string `path:"adapterId"` | |||||
| } | |||||
| AiDatasetsResp { | AiDatasetsResp { | ||||
| Datasets []string `json:"datasets"` | Datasets []string `json:"datasets"` | ||||
| } | } | ||||
| @@ -55,6 +61,7 @@ type ( | |||||
| } | } | ||||
| AiAlgorithmsReq { | AiAlgorithmsReq { | ||||
| AdapterId string `path:"adapterId"` | |||||
| ResourceType string `path:"resourceType"` | ResourceType string `path:"resourceType"` | ||||
| TaskType string `path:"taskType"` | TaskType string `path:"taskType"` | ||||
| Dataset string `path:"dataset"` | Dataset string `path:"dataset"` | ||||
| @@ -0,0 +1,28 @@ | |||||
| package core | |||||
| import ( | |||||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" | |||||
| "net/http" | |||||
| "github.com/zeromicro/go-zero/rest/httpx" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| ) | |||||
| func ListNoticeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { | |||||
| return func(w http.ResponseWriter, r *http.Request) { | |||||
| var req clientCore.ListNoticeReq | |||||
| if err := httpx.Parse(r, &req); err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| return | |||||
| } | |||||
| l := core.NewListNoticeLogic(r.Context(), svcCtx) | |||||
| resp, err := l.ListNotice(&req) | |||||
| if err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| } else { | |||||
| httpx.OkJsonCtx(r.Context(), w, resp) | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,28 @@ | |||||
| package core | |||||
| import ( | |||||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" | |||||
| "net/http" | |||||
| "github.com/zeromicro/go-zero/rest/httpx" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| ) | |||||
| func PushNoticeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { | |||||
| return func(w http.ResponseWriter, r *http.Request) { | |||||
| var req clientCore.PushNoticeReq | |||||
| if err := httpx.Parse(r, &req); err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| return | |||||
| } | |||||
| l := core.NewPushNoticeLogic(r.Context(), svcCtx) | |||||
| resp, err := l.PushNotice(&req) | |||||
| if err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| } else { | |||||
| httpx.OkJsonCtx(r.Context(), w, resp) | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,28 @@ | |||||
| package core | |||||
| import ( | |||||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" | |||||
| "net/http" | |||||
| "github.com/zeromicro/go-zero/rest/httpx" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| ) | |||||
| func PushResourceInfoHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { | |||||
| return func(w http.ResponseWriter, r *http.Request) { | |||||
| var req clientCore.PushResourceInfoReq | |||||
| if err := httpx.Parse(r, &req); err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| return | |||||
| } | |||||
| l := core.NewPushResourceInfoLogic(r.Context(), svcCtx) | |||||
| resp, err := l.PushResourceInfo(&req) | |||||
| if err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| } else { | |||||
| httpx.OkJsonCtx(r.Context(), w, resp) | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,28 @@ | |||||
| package hpc | |||||
| import ( | |||||
| "net/http" | |||||
| "github.com/zeromicro/go-zero/rest/httpx" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/hpc" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||||
| ) | |||||
| func ResourceHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { | |||||
| return func(w http.ResponseWriter, r *http.Request) { | |||||
| var req types.HpcResourceReq | |||||
| if err := httpx.Parse(r, &req); err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| return | |||||
| } | |||||
| l := hpc.NewResourceLogic(r.Context(), svcCtx) | |||||
| resp, err := l.Resource(&req) | |||||
| if err != nil { | |||||
| httpx.ErrorCtx(r.Context(), w, err) | |||||
| } else { | |||||
| httpx.OkJsonCtx(r.Context(), w, resp) | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -140,6 +140,21 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { | |||||
| Path: "/core/pushTaskInfo", | Path: "/core/pushTaskInfo", | ||||
| Handler: core.PushTaskInfoHandler(serverCtx), | Handler: core.PushTaskInfoHandler(serverCtx), | ||||
| }, | }, | ||||
| { | |||||
| Method: http.MethodPost, | |||||
| Path: "/core/pushResourceInfo", | |||||
| Handler: core.PushResourceInfoHandler(serverCtx), | |||||
| }, | |||||
| { | |||||
| Method: http.MethodPost, | |||||
| Path: "/core/pushNotice", | |||||
| Handler: core.PushNoticeHandler(serverCtx), | |||||
| }, | |||||
| { | |||||
| Method: http.MethodGet, | |||||
| Path: "/core/listNotice", | |||||
| Handler: core.ListNoticeHandler(serverCtx), | |||||
| }, | |||||
| { | { | ||||
| Method: http.MethodGet, | Method: http.MethodGet, | ||||
| Path: "/core/task/list", | Path: "/core/task/list", | ||||
| @@ -171,6 +186,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { | |||||
| Path: "/hpc/job", | Path: "/hpc/job", | ||||
| Handler: hpc.JobHandler(serverCtx), | Handler: hpc.JobHandler(serverCtx), | ||||
| }, | }, | ||||
| { | |||||
| Method: http.MethodGet, | |||||
| Path: "/hpc/resource", | |||||
| Handler: hpc.ResourceHandler(serverCtx), | |||||
| }, | |||||
| { | { | ||||
| Method: http.MethodGet, | Method: http.MethodGet, | ||||
| Path: "/hpc/queueAssets", | Path: "/hpc/queueAssets", | ||||
| @@ -1107,7 +1127,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { | |||||
| }, | }, | ||||
| { | { | ||||
| Method: http.MethodGet, | Method: http.MethodGet, | ||||
| Path: "/schedule/ai/getDatasets", | |||||
| Path: "/schedule/ai/getDatasets/:adapterId", | |||||
| Handler: schedule.ScheduleGetDatasetsHandler(serverCtx), | Handler: schedule.ScheduleGetDatasetsHandler(serverCtx), | ||||
| }, | }, | ||||
| { | { | ||||
| @@ -1117,7 +1137,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { | |||||
| }, | }, | ||||
| { | { | ||||
| Method: http.MethodGet, | Method: http.MethodGet, | ||||
| Path: "/schedule/ai/getAlgorithms/:resourceType/:taskType/:dataset", | |||||
| Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset", | |||||
| Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx), | Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx), | ||||
| }, | }, | ||||
| { | { | ||||
| @@ -1,16 +1,24 @@ | |||||
| package schedule | package schedule | ||||
| import ( | import ( | ||||
| "github.com/zeromicro/go-zero/rest/httpx" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" | ||||
| "net/http" | "net/http" | ||||
| ) | ) | ||||
| func ScheduleGetDatasetsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { | func ScheduleGetDatasetsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { | ||||
| return func(w http.ResponseWriter, r *http.Request) { | return func(w http.ResponseWriter, r *http.Request) { | ||||
| var req types.AiDatasetsReq | |||||
| if err := httpx.Parse(r, &req); err != nil { | |||||
| result.ParamErrorResult(r, w, err) | |||||
| return | |||||
| } | |||||
| l := schedule.NewScheduleGetDatasetsLogic(r.Context(), svcCtx) | l := schedule.NewScheduleGetDatasetsLogic(r.Context(), svcCtx) | ||||
| resp, err := l.ScheduleGetDatasets() | |||||
| resp, err := l.ScheduleGetDatasets(&req) | |||||
| result.HttpResult(r, w, resp, err) | result.HttpResult(r, w, resp, err) | ||||
| } | } | ||||
| } | } | ||||
| @@ -2,13 +2,12 @@ package core | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/mqs" | |||||
| "fmt" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | ||||
| tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | |||||
| "math/rand" | |||||
| "time" | "time" | ||||
| "github.com/zeromicro/go-zero/core/logx" | "github.com/zeromicro/go-zero/core/logx" | ||||
| @@ -35,7 +34,6 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type | |||||
| Status: constants.Saved, | Status: constants.Saved, | ||||
| Name: req.Name, | Name: req.Name, | ||||
| CommitTime: time.Now(), | CommitTime: time.Now(), | ||||
| NsID: req.NsID, | |||||
| } | } | ||||
| // Save task data to database | // Save task data to database | ||||
| tx := l.svcCtx.DbEngin.Create(&taskModel) | tx := l.svcCtx.DbEngin.Create(&taskModel) | ||||
| @@ -43,28 +41,38 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type | |||||
| return nil, tx.Error | return nil, tx.Error | ||||
| } | } | ||||
| var clusterIds []int64 | |||||
| l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds) | |||||
| for _, CreateMulServer := range req.CreateMulServer { | |||||
| fmt.Println("", req.CreateMulServer) | |||||
| var clusterIds []int64 | |||||
| l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds) | |||||
| if len(clusterIds) == 0 || clusterIds == nil { | |||||
| return nil, nil | |||||
| } | |||||
| if len(clusterIds) == 0 || clusterIds == nil { | |||||
| return nil, nil | |||||
| } | |||||
| vm := models.Vm{} | |||||
| tool.Convert(req, &vm) | |||||
| mqInfo := response.TaskInfo{ | |||||
| TaskId: taskModel.Id, | |||||
| TaskType: "vm", | |||||
| MatchLabels: req.MatchLabels, | |||||
| NsID: req.NsID, | |||||
| } | |||||
| //req.TaskId = taskModel.Id | |||||
| mqs.InsQueue.Beta.Add(&mqInfo) | |||||
| tx = l.svcCtx.DbEngin.Create(&mqInfo) | |||||
| resp = &types.CommitVmTaskResp{ | |||||
| Code: 200, | |||||
| Msg: "success", | |||||
| TaskId: taskModel.Id, | |||||
| vmInfo := models.TaskVm{ | |||||
| TaskId: taskModel.Id, | |||||
| ClusterId: clusterIds[rand.Intn(len(clusterIds))], | |||||
| Name: taskModel.Name, | |||||
| Status: "Saved", | |||||
| StartTime: time.Now().String(), | |||||
| MinCount: CreateMulServer.Min_count, | |||||
| ImageRef: CreateMulServer.ImageRef, | |||||
| FlavorRef: CreateMulServer.FlavorRef, | |||||
| Uuid: CreateMulServer.Uuid, | |||||
| Platform: CreateMulServer.Platform, | |||||
| } | |||||
| tx = l.svcCtx.DbEngin.Create(&vmInfo) | |||||
| if tx.Error != nil { | |||||
| return nil, tx.Error | |||||
| } | |||||
| resp = &types.CommitVmTaskResp{ | |||||
| Code: 200, | |||||
| Msg: "success", | |||||
| TaskId: taskModel.Id, | |||||
| } | |||||
| } | } | ||||
| return resp, nil | return resp, nil | ||||
| } | } | ||||
| @@ -0,0 +1,36 @@ | |||||
| package core | |||||
| import ( | |||||
| "context" | |||||
| "github.com/zeromicro/go-zero/core/logx" | |||||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| ) | |||||
| type ListNoticeLogic struct { | |||||
| logx.Logger | |||||
| ctx context.Context | |||||
| svcCtx *svc.ServiceContext | |||||
| } | |||||
| func NewListNoticeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ListNoticeLogic { | |||||
| return &ListNoticeLogic{ | |||||
| Logger: logx.WithContext(ctx), | |||||
| ctx: ctx, | |||||
| svcCtx: svcCtx, | |||||
| } | |||||
| } | |||||
| func (l *ListNoticeLogic) ListNotice(req *clientCore.ListNoticeReq) (*clientCore.ListNoticeResp, error) { | |||||
| var notices []clientCore.NoticeInfo | |||||
| var resp clientCore.ListNoticeResp | |||||
| l.svcCtx.DbEngin.Raw("select * from t_notice order by created_time desc").Scan(¬ices) | |||||
| for _, notice := range notices { | |||||
| resp.Data = append(resp.Data, notice) | |||||
| } | |||||
| resp.Code = 200 | |||||
| resp.Msg = "success" | |||||
| return &resp, nil | |||||
| } | |||||
| @@ -67,6 +67,13 @@ func (l *PullTaskInfoLogic) PullTaskInfo(req *clientCore.PullTaskInfoReq) (*clie | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| utils.Convert(aiModelList, &resp.AiInfoList) | utils.Convert(aiModelList, &resp.AiInfoList) | ||||
| case 3: | |||||
| var vmModelList []models.TaskVm | |||||
| err := findModelList(req.AdapterId, l.svcCtx.DbEngin, &vmModelList) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| utils.Convert(vmModelList, &resp.VmInfoList) | |||||
| } | } | ||||
| return &resp, nil | return &resp, nil | ||||
| } | } | ||||
| @@ -0,0 +1,31 @@ | |||||
| package core | |||||
| import ( | |||||
| "context" | |||||
| "github.com/zeromicro/go-zero/core/logx" | |||||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| ) | |||||
// PushNoticeLogic serves the notice-push endpoint.
type PushNoticeLogic struct {
	logx.Logger
	ctx    context.Context       // per-request context
	svcCtx *svc.ServiceContext   // shared service dependencies (DB engine, etc.)
}
| func NewPushNoticeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *PushNoticeLogic { | |||||
| return &PushNoticeLogic{ | |||||
| Logger: logx.WithContext(ctx), | |||||
| ctx: ctx, | |||||
| svcCtx: svcCtx, | |||||
| } | |||||
| } | |||||
| func (l *PushNoticeLogic) PushNotice(req *clientCore.PushNoticeReq) (resp *clientCore.PushNoticeResp, err error) { | |||||
| result := l.svcCtx.DbEngin.Table("t_notice").Create(&req.NoticeInfo) | |||||
| if result.Error != nil { | |||||
| return nil, result.Error | |||||
| } | |||||
| return | |||||
| } | |||||
| @@ -0,0 +1,28 @@ | |||||
| package core | |||||
| import ( | |||||
| "context" | |||||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" | |||||
| "github.com/zeromicro/go-zero/core/logx" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| ) | |||||
// PushResourceInfoLogic serves the resource-info push endpoint.
type PushResourceInfoLogic struct {
	logx.Logger
	ctx    context.Context       // per-request context
	svcCtx *svc.ServiceContext   // shared service dependencies
}
| func NewPushResourceInfoLogic(ctx context.Context, svcCtx *svc.ServiceContext) *PushResourceInfoLogic { | |||||
| return &PushResourceInfoLogic{ | |||||
| Logger: logx.WithContext(ctx), | |||||
| ctx: ctx, | |||||
| svcCtx: svcCtx, | |||||
| } | |||||
| } | |||||
| func (l *PushResourceInfoLogic) PushResourceInfo(req *clientCore.PushResourceInfoReq) (resp *clientCore.PushResourceInfoResp, err error) { | |||||
| return | |||||
| } | |||||
| @@ -49,6 +49,12 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie | |||||
| aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name) | aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name) | ||||
| syncTask(l.svcCtx.DbEngin, aiInfo.TaskId) | syncTask(l.svcCtx.DbEngin, aiInfo.TaskId) | ||||
| } | } | ||||
| case 3: | |||||
| for _, vmInfo := range req.VmInfoList { | |||||
| l.svcCtx.DbEngin.Exec("update task_vm set status = ?,start_time = ? where participant_id = ? and task_id = ? and name = ?", | |||||
| vmInfo.Status, vmInfo.StartTime, req.AdapterId, vmInfo.TaskId, vmInfo.Name) | |||||
| syncTask(l.svcCtx.DbEngin, vmInfo.TaskId) | |||||
| } | |||||
| } | } | ||||
| return &resp, nil | return &resp, nil | ||||
| @@ -0,0 +1,48 @@ | |||||
| package hpc | |||||
| import ( | |||||
| "context" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||||
| "github.com/zeromicro/go-zero/core/logx" | |||||
| ) | |||||
// ResourceLogic serves the HPC resource-overview endpoint.
type ResourceLogic struct {
	logx.Logger
	ctx    context.Context       // per-request context
	svcCtx *svc.ServiceContext   // shared service dependencies (DB engine, etc.)
}
| func NewResourceLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ResourceLogic { | |||||
| return &ResourceLogic{ | |||||
| Logger: logx.WithContext(ctx), | |||||
| ctx: ctx, | |||||
| svcCtx: svcCtx, | |||||
| } | |||||
| } | |||||
| func (l *ResourceLogic) Resource(req *types.HpcResourceReq) (resp *types.HpcResourceResp, err error) { | |||||
| l.svcCtx.DbEngin.Raw("SELECT th.NAME as job_name,t.description as job_desc,t.commit_time as submit_time,th.STATUS as job_status,ta.name as adapter_name,tc.name as cluster_name,tc.label as cluster_type FROM task_hpc th LEFT JOIN task t ON t.id = th.task_id JOIN t_cluster tc on th.cluster_id = tc.id JOIN t_adapter ta on tc.adapter_id = ta.id") | |||||
| hpcResource := types.HPCResource{ | |||||
| GPUCardsTotal: 0, | |||||
| CPUCoresTotal: 0, | |||||
| RAMTotal: 0, | |||||
| GPUCardsUsed: 0, | |||||
| CPUCoresUsed: 0, | |||||
| RAMUsed: 0, | |||||
| GPURate: 0, | |||||
| CPURate: 0, | |||||
| RAMRate: 0, | |||||
| } | |||||
| resp = &types.HpcResourceResp{ | |||||
| Code: 200, | |||||
| Msg: "success", | |||||
| HPCResource: hpcResource, | |||||
| } | |||||
| return resp, nil | |||||
| } | |||||
| @@ -26,7 +26,7 @@ func NewScheduleGetAlgorithmsLogic(ctx context.Context, svcCtx *svc.ServiceConte | |||||
| func (l *ScheduleGetAlgorithmsLogic) ScheduleGetAlgorithms(req *types.AiAlgorithmsReq) (resp *types.AiAlgorithmsResp, err error) { | func (l *ScheduleGetAlgorithmsLogic) ScheduleGetAlgorithms(req *types.AiAlgorithmsReq) (resp *types.AiAlgorithmsResp, err error) { | ||||
| resp = &types.AiAlgorithmsResp{} | resp = &types.AiAlgorithmsResp{} | ||||
| algorithms, err := storeLink.GetAlgorithms(l.ctx, l.svcCtx.Scheduler.ResourceCollector, req.ResourceType, req.TaskType, req.Dataset) | |||||
| algorithms, err := storeLink.GetAlgorithms(l.ctx, l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId], req.ResourceType, req.TaskType, req.Dataset) | |||||
| if err != nil { | if err != nil { | ||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| @@ -3,6 +3,7 @@ package schedule | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | ||||
| @@ -23,9 +24,9 @@ func NewScheduleGetDatasetsLogic(ctx context.Context, svcCtx *svc.ServiceContext | |||||
| } | } | ||||
| } | } | ||||
| func (l *ScheduleGetDatasetsLogic) ScheduleGetDatasets() (resp *types.AiDatasetsResp, err error) { | |||||
| func (l *ScheduleGetDatasetsLogic) ScheduleGetDatasets(req *types.AiDatasetsReq) (resp *types.AiDatasetsResp, err error) { | |||||
| resp = &types.AiDatasetsResp{} | resp = &types.AiDatasetsResp{} | ||||
| names, err := storeLink.GetDatasetsNames(l.ctx, l.svcCtx.Scheduler.ResourceCollector) | |||||
| names, err := storeLink.GetDatasetsNames(l.ctx, l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId]) | |||||
| if err != nil { | if err != nil { | ||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| @@ -27,6 +27,7 @@ func NewScheduleSubmitLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Sc | |||||
| func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) { | func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) { | ||||
| resp = &types.ScheduleResp{} | resp = &types.ScheduleResp{} | ||||
| opt := &option.AiOption{ | opt := &option.AiOption{ | ||||
| AdapterId: req.AiOption.AdapterId, | |||||
| ResourceType: req.AiOption.ResourceType, | ResourceType: req.AiOption.ResourceType, | ||||
| Tops: req.AiOption.Tops, | Tops: req.AiOption.Tops, | ||||
| TaskType: req.AiOption.TaskType, | TaskType: req.AiOption.TaskType, | ||||
| @@ -55,6 +56,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type | |||||
| scheResult := &types.ScheduleResult{} | scheResult := &types.ScheduleResult{} | ||||
| scheResult.ClusterId = r.ClusterId | scheResult.ClusterId = r.ClusterId | ||||
| scheResult.TaskId = r.TaskId | scheResult.TaskId = r.TaskId | ||||
| scheResult.Strategy = r.Strategy | |||||
| scheResult.Replica = r.Replica | scheResult.Replica = r.Replica | ||||
| scheResult.Msg = r.Msg | scheResult.Msg = r.Msg | ||||
| resp.Results = append(resp.Results, scheResult) | resp.Results = append(resp.Results, scheResult) | ||||
| @@ -5,9 +5,8 @@ import ( | |||||
| ) | ) | ||||
// Weight pairs an id with the scheduling weight and replica count assigned
// to it. NOTE(review): Id presumably identifies a cluster — confirm against
// callers.
type Weight struct {
	Id      string
	Weight  int32
	Replica int32
}
| @@ -33,6 +33,21 @@ func (s *AiStorage) GetClustersByAdapterId(id string) (*types.ClusterListResp, e | |||||
| return &resp, nil | return &resp, nil | ||||
| } | } | ||||
| func (s *AiStorage) GetAdapterIdsByType(adapterType string) ([]string, error) { | |||||
| var list []types.AdapterInfo | |||||
| var ids []string | |||||
| db := s.DbEngin.Model(&types.AdapterInfo{}).Table("t_adapter") | |||||
| db = db.Where("type = ?", adapterType) | |||||
| err := db.Order("create_time desc").Find(&list).Error | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| for _, info := range list { | |||||
| ids = append(ids, info.Id) | |||||
| } | |||||
| return ids, nil | |||||
| } | |||||
| func (s *AiStorage) SaveTask(name string) error { | func (s *AiStorage) SaveTask(name string) error { | ||||
| // 构建主任务结构体 | // 构建主任务结构体 | ||||
| taskModel := models.Task{ | taskModel := models.Task{ | ||||
| @@ -20,8 +20,7 @@ import ( | |||||
| "github.com/zeromicro/go-zero/core/logx" | "github.com/zeromicro/go-zero/core/logx" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/common" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/common" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/rpc/client/participantservice" | "gitlink.org.cn/JointCloud/pcm-coordinator/rpc/client/participantservice" | ||||
| @@ -32,16 +31,15 @@ import ( | |||||
| ) | ) | ||||
// Scheduler coordinates task scheduling across participant clusters.
type Scheduler struct {
	task           *response.TaskInfo
	participantIds []int64
	subSchedule    SubSchedule
	dbEngin        *gorm.DB
	result         []string // pID -> sub-task YAML string pairs
	participantRpc participantservice.ParticipantService
	AiStorages     *database.AiStorage // persistence layer for AI scheduling data
	AiService      *service.AiService  // per-adapter executor/collector maps
	mu             sync.RWMutex
}
| type SubSchedule interface { | type SubSchedule interface { | ||||
| @@ -59,8 +57,8 @@ func NewScheduler(subSchedule SubSchedule, val string, dbEngin *gorm.DB, partici | |||||
| return &Scheduler{task: task, subSchedule: subSchedule, dbEngin: dbEngin, participantRpc: participantRpc}, nil | return &Scheduler{task: task, subSchedule: subSchedule, dbEngin: dbEngin, participantRpc: participantRpc}, nil | ||||
| } | } | ||||
| func NewSchdlr(resourceCollector *map[string]collector.AiCollector, storages *database.AiStorage, aiExecutor *map[string]executor.AiExecutor) *Scheduler { | |||||
| return &Scheduler{ResourceCollector: resourceCollector, AiStorages: storages, AiExecutor: aiExecutor} | |||||
| func NewSchdlr(aiService *service.AiService, storages *database.AiStorage) *Scheduler { | |||||
| return &Scheduler{AiService: aiService, AiStorages: storages} | |||||
| } | } | ||||
| func (s *Scheduler) SpecifyClusters() { | func (s *Scheduler) SpecifyClusters() { | ||||
| @@ -18,6 +18,7 @@ import ( | |||||
| "context" | "context" | ||||
| "encoding/json" | "encoding/json" | ||||
| "errors" | "errors" | ||||
| "fmt" | |||||
| "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" | "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" | ||||
| @@ -28,7 +29,6 @@ import ( | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | ||||
| "gitlink.org.cn/JointCloud/pcm-octopus/octopus" | "gitlink.org.cn/JointCloud/pcm-octopus/octopus" | ||||
| "strconv" | |||||
| "sync" | "sync" | ||||
| ) | ) | ||||
| @@ -43,6 +43,7 @@ type AiScheduler struct { | |||||
// AiResult is the outcome of submitting an AI task to a single cluster.
type AiResult struct {
	TaskId    string
	ClusterId string
	Strategy  string // name of the strategy that produced this assignment
	Replica   int32
	Msg       string
}
| @@ -63,9 +64,8 @@ func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource strin | |||||
| } | } | ||||
| func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) { | func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) { | ||||
| if as.option.AiClusterId != "" { | |||||
| // TODO database operation Find | |||||
| return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ParticipantId: 0, Name: "", Replicas: 1}}, nil | |||||
| if len(as.option.ClusterIds) == 1 { | |||||
| return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: as.option.ClusterIds[0], Replicas: 1}}, nil | |||||
| } | } | ||||
| resources, err := as.findClustersWithResources() | resources, err := as.findClustersWithResources() | ||||
| @@ -79,8 +79,7 @@ func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) { | |||||
| if len(resources) == 1 { | if len(resources) == 1 { | ||||
| var cluster strategy.AssignedCluster | var cluster strategy.AssignedCluster | ||||
| cluster.ParticipantId = resources[0].ParticipantId | |||||
| cluster.Name = resources[0].Name | |||||
| cluster.ClusterId = resources[0].ClusterId | |||||
| cluster.Replicas = 1 | cluster.Replicas = 1 | ||||
| return &strategy.SingleAssignment{Cluster: &cluster}, nil | return &strategy.SingleAssignment{Cluster: &cluster}, nil | ||||
| } | } | ||||
| @@ -89,7 +88,11 @@ func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) { | |||||
| switch as.option.StrategyName { | switch as.option.StrategyName { | ||||
| case strategy.REPLICATION: | case strategy.REPLICATION: | ||||
| strategy := strategy.NewReplicationStrategy(¶m.ReplicationParams{Params: params, Replicas: 1}) | |||||
| var clusterIds []string | |||||
| for _, resource := range resources { | |||||
| clusterIds = append(clusterIds, resource.ClusterId) | |||||
| } | |||||
| strategy := strategy.NewReplicationStrategy(clusterIds, 1) | |||||
| return strategy, nil | return strategy, nil | ||||
| case strategy.RESOURCES_PRICING: | case strategy.RESOURCES_PRICING: | ||||
| strategy := strategy.NewPricingStrategy(¶m.ResourcePricingParams{Params: params, Replicas: 1}) | strategy := strategy.NewPricingStrategy(¶m.ResourcePricingParams{Params: params, Replicas: 1}) | ||||
| @@ -111,32 +114,47 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa | |||||
| return nil, errors.New("clusters is nil") | return nil, errors.New("clusters is nil") | ||||
| } | } | ||||
| for i := len(clusters) - 1; i >= 0; i-- { | |||||
| if clusters[i].Replicas == 0 { | |||||
| clusters = append(clusters[:i], clusters[i+1:]...) | |||||
| } | |||||
| } | |||||
| if len(clusters) == 0 { | |||||
| return nil, errors.New("clusters is nil") | |||||
| } | |||||
| var wg sync.WaitGroup | var wg sync.WaitGroup | ||||
| var results []*AiResult | var results []*AiResult | ||||
| var errs []error | |||||
| var errs []interface{} | |||||
| var ch = make(chan *AiResult, len(clusters)) | var ch = make(chan *AiResult, len(clusters)) | ||||
| var errCh = make(chan error, len(clusters)) | |||||
| var errCh = make(chan interface{}, len(clusters)) | |||||
| executorMap := *as.AiExecutor | |||||
| executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId] | |||||
| for _, cluster := range clusters { | for _, cluster := range clusters { | ||||
| c := cluster | c := cluster | ||||
| if cluster.Replicas == 0 { | |||||
| continue | |||||
| } | |||||
| wg.Add(1) | wg.Add(1) | ||||
| go func() { | go func() { | ||||
| opt, _ := cloneAiOption(as.option) | opt, _ := cloneAiOption(as.option) | ||||
| resp, err := executorMap[c.Name].Execute(as.ctx, opt) | |||||
| resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt) | |||||
| if err != nil { | if err != nil { | ||||
| errCh <- err | |||||
| e := struct { | |||||
| err error | |||||
| clusterId string | |||||
| }{ | |||||
| err: err, | |||||
| clusterId: c.ClusterId, | |||||
| } | |||||
| errCh <- e | |||||
| wg.Done() | wg.Done() | ||||
| return | return | ||||
| } | } | ||||
| result, _ := convertType(resp) | result, _ := convertType(resp) | ||||
| result.Replica = c.Replicas | result.Replica = c.Replicas | ||||
| result.ClusterId = strconv.FormatInt(c.ParticipantId, 10) | |||||
| result.ClusterId = c.ClusterId | |||||
| result.Strategy = as.option.StrategyName | |||||
| ch <- result | ch <- result | ||||
| wg.Done() | wg.Done() | ||||
| @@ -150,10 +168,29 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa | |||||
| errs = append(errs, e) | errs = append(errs, e) | ||||
| } | } | ||||
| if len(errs) != 0 { | |||||
| if len(errs) == len(clusters) { | |||||
| return nil, errors.New("submit task failed") | return nil, errors.New("submit task failed") | ||||
| } | } | ||||
| if len(errs) != 0 { | |||||
| var msg string | |||||
| for _, err := range errs { | |||||
| e := (err).(struct { | |||||
| err error | |||||
| clusterId string | |||||
| }) | |||||
| msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error()) | |||||
| } | |||||
| for s := range ch { | |||||
| if s.Msg != "" { | |||||
| msg += fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg) | |||||
| } else { | |||||
| msg += fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId) | |||||
| } | |||||
| } | |||||
| return nil, errors.New(msg) | |||||
| } | |||||
| for s := range ch { | for s := range ch { | ||||
| // TODO: database operation | // TODO: database operation | ||||
| results = append(results, s) | results = append(results, s) | ||||
| @@ -164,19 +201,28 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa | |||||
| func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) { | func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) { | ||||
| var wg sync.WaitGroup | var wg sync.WaitGroup | ||||
| var ch = make(chan *collector.ResourceStats, len(*as.ResourceCollector)) | |||||
| var errCh = make(chan error, len(*as.ResourceCollector)) | |||||
| var clustersNum = len(as.AiService.AiCollectorAdapterMap[as.option.AdapterId]) | |||||
| var ch = make(chan *collector.ResourceStats, clustersNum) | |||||
| var errCh = make(chan interface{}, clustersNum) | |||||
| var resourceSpecs []*collector.ResourceStats | var resourceSpecs []*collector.ResourceStats | ||||
| var errs []error | |||||
| var errs []interface{} | |||||
| for _, resourceCollector := range *as.ResourceCollector { | |||||
| for s, resourceCollector := range as.AiService.AiCollectorAdapterMap[as.option.AdapterId] { | |||||
| wg.Add(1) | wg.Add(1) | ||||
| rc := resourceCollector | rc := resourceCollector | ||||
| id := s | |||||
| go func() { | go func() { | ||||
| spec, err := rc.GetResourceStats(as.ctx) | spec, err := rc.GetResourceStats(as.ctx) | ||||
| if err != nil { | if err != nil { | ||||
| errCh <- err | |||||
| e := struct { | |||||
| err error | |||||
| clusterId string | |||||
| }{ | |||||
| err: err, | |||||
| clusterId: id, | |||||
| } | |||||
| errCh <- e | |||||
| wg.Done() | wg.Done() | ||||
| return | return | ||||
| } | } | ||||
| @@ -196,13 +242,22 @@ func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, | |||||
| errs = append(errs, e) | errs = append(errs, e) | ||||
| } | } | ||||
| if len(errs) != 0 { | |||||
| if len(errs) == clustersNum { | |||||
| return nil, errors.New("get resources failed") | return nil, errors.New("get resources failed") | ||||
| } | } | ||||
| if len(resourceSpecs) == 0 { | |||||
| return nil, errors.New("no resource found") | |||||
| if len(errs) != 0 { | |||||
| var msg string | |||||
| for _, err := range errs { | |||||
| e := (err).(struct { | |||||
| err error | |||||
| clusterId string | |||||
| }) | |||||
| msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error()) | |||||
| } | |||||
| return nil, errors.New(msg) | |||||
| } | } | ||||
| return resourceSpecs, nil | return resourceSpecs, nil | ||||
| } | } | ||||
| @@ -1,7 +1,8 @@ | |||||
| package option | package option | ||||
| type AiOption struct { | type AiOption struct { | ||||
| AiClusterId string // shuguangAi /octopus ClusterId | |||||
| AdapterId string | |||||
| ClusterIds []string | |||||
| TaskName string | TaskName string | ||||
| ResourceType string // cpu/gpu/compute card | ResourceType string // cpu/gpu/compute card | ||||
| CpuCoreNum int64 | CpuCoreNum int64 | ||||
| @@ -1,11 +1,14 @@ | |||||
| package service | package service | ||||
| import ( | import ( | ||||
| "github.com/zeromicro/go-zero/zrpc" | |||||
| "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" | "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/config" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" | |||||
| "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" | "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" | ||||
| "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" | "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" | ||||
| "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" | "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" | ||||
| @@ -18,30 +21,60 @@ const ( | |||||
| SHUGUANGAI = "shuguangAi" | SHUGUANGAI = "shuguangAi" | ||||
| ) | ) | ||||
| func InitAiClusterMap(octopusRpc octopusclient.Octopus, modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, aCRpc hpcacclient.HpcAC, storages *database.AiStorage) (*map[string]executor.AiExecutor, *map[string]collector.AiCollector) { | |||||
| clusters, _ := storages.GetClustersByAdapterId("1777144940459986944") | |||||
// AiService holds, per adapter id, the executor and collector instances for
// every cluster managed by that adapter. Outer maps are keyed by adapter id,
// inner maps by cluster id.
type AiService struct {
	AiExecutorAdapterMap  map[string]map[string]executor.AiExecutor
	AiCollectorAdapterMap map[string]map[string]collector.AiCollector
}
| func NewAiService(conf *config.Config, storages *database.AiStorage) (*AiService, error) { | |||||
| var aiType = "1" | |||||
| adapterIds, err := storages.GetAdapterIdsByType(aiType) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| aiService := &AiService{ | |||||
| AiExecutorAdapterMap: make(map[string]map[string]executor.AiExecutor), | |||||
| AiCollectorAdapterMap: make(map[string]map[string]collector.AiCollector), | |||||
| } | |||||
| for _, id := range adapterIds { | |||||
| clusters, err := storages.GetClustersByAdapterId(id) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| exeClusterMap, colClusterMap := InitAiClusterMap(conf, clusters.List) | |||||
| aiService.AiExecutorAdapterMap[id] = exeClusterMap | |||||
| aiService.AiCollectorAdapterMap[id] = colClusterMap | |||||
| } | |||||
| return aiService, nil | |||||
| } | |||||
| func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) { | |||||
| executorMap := make(map[string]executor.AiExecutor) | executorMap := make(map[string]executor.AiExecutor) | ||||
| collectorMap := make(map[string]collector.AiCollector) | collectorMap := make(map[string]collector.AiCollector) | ||||
| for _, c := range clusters.List { | |||||
| for _, c := range clusters { | |||||
| switch c.Name { | switch c.Name { | ||||
| case OCTOPUS: | case OCTOPUS: | ||||
| id, _ := strconv.ParseInt(c.Id, 10, 64) | id, _ := strconv.ParseInt(c.Id, 10, 64) | ||||
| octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf)) | |||||
| octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id) | octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id) | ||||
| collectorMap[c.Nickname] = octopus | |||||
| executorMap[c.Nickname] = octopus | |||||
| collectorMap[c.Id] = octopus | |||||
| executorMap[c.Id] = octopus | |||||
| case MODELARTS: | case MODELARTS: | ||||
| id, _ := strconv.ParseInt(c.Id, 10, 64) | id, _ := strconv.ParseInt(c.Id, 10, 64) | ||||
| modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf)) | |||||
| modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf)) | |||||
| modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Nickname, id) | modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Nickname, id) | ||||
| collectorMap[c.Nickname] = modelarts | |||||
| executorMap[c.Nickname] = modelarts | |||||
| collectorMap[c.Id] = modelarts | |||||
| executorMap[c.Id] = modelarts | |||||
| case SHUGUANGAI: | case SHUGUANGAI: | ||||
| id, _ := strconv.ParseInt(c.Id, 10, 64) | id, _ := strconv.ParseInt(c.Id, 10, 64) | ||||
| aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf)) | |||||
| sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id) | sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id) | ||||
| collectorMap[c.Nickname] = sgai | |||||
| executorMap[c.Nickname] = sgai | |||||
| collectorMap[c.Id] = sgai | |||||
| executorMap[c.Id] = sgai | |||||
| } | } | ||||
| } | } | ||||
| return &executorMap, &collectorMap | |||||
| return executorMap, collectorMap | |||||
| } | } | ||||
| @@ -9,18 +9,18 @@ type AiCollector interface { | |||||
| } | } | ||||
// ResourceStats describes the resources reported as available on one cluster.
type ResourceStats struct {
	ClusterId    string // id of the cluster these stats belong to
	Name         string
	CpuCoreAvail int64
	CpuCoreTotal int64
	MemAvail     float64
	MemTotal     float64
	DiskAvail    float64
	DiskTotal    float64
	GpuAvail     int64
	CardsAvail   []*Card
	CpuCoreHours float64
	Balance      float64
}
| type Card struct { | type Card struct { | ||||
| @@ -33,15 +33,14 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) { | |||||
| for _, res := range ps.resources { | for _, res := range ps.resources { | ||||
| if opt.ResourceType == "cpu" { | if opt.ResourceType == "cpu" { | ||||
| if res.CpuCoreHours <= 0 { | if res.CpuCoreHours <= 0 { | ||||
| cluster := &AssignedCluster{ParticipantId: res.ParticipantId, Name: res.Name, Replicas: ps.replicas} | |||||
| cluster := &AssignedCluster{ClusterId: res.ClusterId, Replicas: ps.replicas} | |||||
| results = append(results, cluster) | results = append(results, cluster) | ||||
| return results, nil | return results, nil | ||||
| } | } | ||||
| if res.CpuCoreHours > maxCpuCoreHoursAvailable { | if res.CpuCoreHours > maxCpuCoreHoursAvailable { | ||||
| maxCpuCoreHoursAvailable = res.CpuCoreHours | maxCpuCoreHoursAvailable = res.CpuCoreHours | ||||
| assignedCluster.Name = res.Name | |||||
| assignedCluster.ParticipantId = res.ParticipantId | |||||
| assignedCluster.ClusterId = res.ClusterId | |||||
| assignedCluster.Replicas = ps.replicas | assignedCluster.Replicas = ps.replicas | ||||
| } | } | ||||
| } | } | ||||
| @@ -56,8 +55,7 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) { | |||||
| } | } | ||||
| if maxCurrentCardHours > maxCardHoursAvailable { | if maxCurrentCardHours > maxCardHoursAvailable { | ||||
| maxCardHoursAvailable = maxCurrentCardHours | maxCardHoursAvailable = maxCurrentCardHours | ||||
| assignedCluster.Name = res.Name | |||||
| assignedCluster.ParticipantId = res.ParticipantId | |||||
| assignedCluster.ClusterId = res.ClusterId | |||||
| assignedCluster.Replicas = ps.replicas | assignedCluster.Replicas = ps.replicas | ||||
| } | } | ||||
| } | } | ||||
| @@ -1,23 +0,0 @@ | |||||
| package param | |||||
| import "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity" | |||||
| type ReplicationParams struct { | |||||
| Replicas int32 | |||||
| *Params | |||||
| } | |||||
| func (r *ReplicationParams) GetReplicas() int32 { | |||||
| return r.Replicas | |||||
| } | |||||
| func (r *ReplicationParams) GetParticipants() []*entity.Participant { | |||||
| var participants []*entity.Participant | |||||
| for _, resource := range r.Resources { | |||||
| participants = append(participants, &entity.Participant{ | |||||
| Participant_id: resource.ParticipantId, | |||||
| Name: resource.Name, | |||||
| }) | |||||
| } | |||||
| return participants | |||||
| } | |||||
| @@ -2,6 +2,7 @@ package param | |||||
| import ( | import ( | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing" | ||||
| "strconv" | |||||
| ) | ) | ||||
| type ResourcePricingParams struct { | type ResourcePricingParams struct { | ||||
| @@ -21,8 +22,9 @@ func (r *ResourcePricingParams) GetTask() *providerPricing.Task { | |||||
| func (r *ResourcePricingParams) GetProviders() []*providerPricing.Provider { | func (r *ResourcePricingParams) GetProviders() []*providerPricing.Provider { | ||||
| var providerList []*providerPricing.Provider | var providerList []*providerPricing.Provider | ||||
| for _, resource := range r.Resources { | for _, resource := range r.Resources { | ||||
| id, _ := strconv.ParseInt(resource.ClusterId, 10, 64) | |||||
| provider := providerPricing.NewProvider( | provider := providerPricing.NewProvider( | ||||
| resource.ParticipantId, | |||||
| id, | |||||
| float64(resource.CpuCoreAvail), | float64(resource.CpuCoreAvail), | ||||
| resource.MemAvail, | resource.MemAvail, | ||||
| resource.DiskAvail, 0.0, 0.0, 0.0) | resource.DiskAvail, 0.0, 0.0, 0.0) | ||||
| @@ -2,33 +2,31 @@ package strategy | |||||
| import ( | import ( | ||||
| "errors" | "errors" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity" | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param" | |||||
| ) | ) | ||||
| type ReplicationStrategy struct { | type ReplicationStrategy struct { | ||||
| replicas int32 | |||||
| participants []*entity.Participant | |||||
| replicas int32 | |||||
| clusterIds []string | |||||
| } | } | ||||
| func NewReplicationStrategy(params *param.ReplicationParams) *ReplicationStrategy { | |||||
| return &ReplicationStrategy{replicas: params.GetReplicas(), | |||||
| participants: params.GetParticipants(), | |||||
| func NewReplicationStrategy(clusterIds []string, replicas int32) *ReplicationStrategy { | |||||
| return &ReplicationStrategy{clusterIds: clusterIds, | |||||
| replicas: replicas, | |||||
| } | } | ||||
| } | } | ||||
| func (ps *ReplicationStrategy) Schedule() ([]*AssignedCluster, error) { | |||||
| if ps.replicas < 1 { | |||||
| func (r *ReplicationStrategy) Schedule() ([]*AssignedCluster, error) { | |||||
| if r.replicas < 1 { | |||||
| return nil, errors.New("replicas must be greater than 0") | return nil, errors.New("replicas must be greater than 0") | ||||
| } | } | ||||
| if ps.participants == nil { | |||||
| return nil, errors.New("participantId must be set") | |||||
| if len(r.clusterIds) == 0 { | |||||
| return nil, errors.New("clusterIds must be set") | |||||
| } | } | ||||
| var results []*AssignedCluster | var results []*AssignedCluster | ||||
| for _, p := range ps.participants { | |||||
| cluster := &AssignedCluster{ParticipantId: p.Participant_id, Name: p.Name, Replicas: ps.replicas} | |||||
| for _, c := range r.clusterIds { | |||||
| cluster := &AssignedCluster{ClusterId: c, Replicas: r.replicas} | |||||
| results = append(results, cluster) | results = append(results, cluster) | ||||
| } | } | ||||
| return results, nil | return results, nil | ||||
| @@ -18,6 +18,7 @@ import ( | |||||
| "errors" | "errors" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param" | ||||
| "strconv" | |||||
| ) | ) | ||||
| type PricingStrategy struct { | type PricingStrategy struct { | ||||
| @@ -154,7 +155,7 @@ func (ps *PricingStrategy) Schedule() ([]*AssignedCluster, error) { | |||||
| if e == 0 { | if e == 0 { | ||||
| continue | continue | ||||
| } | } | ||||
| cluster := &AssignedCluster{ParticipantId: ps.ProviderList[i].Pid, Replicas: int32(e)} | |||||
| cluster := &AssignedCluster{ClusterId: strconv.FormatInt(ps.ProviderList[i].Pid, 10), Replicas: int32(e)} | |||||
| results = append(results, cluster) | results = append(results, cluster) | ||||
| } | } | ||||
| @@ -29,7 +29,7 @@ func (s *StaticWeightStrategy) Schedule() ([]*AssignedCluster, error) { | |||||
| weights := make([]*weightDistributing.Weight, 0) | weights := make([]*weightDistributing.Weight, 0) | ||||
| for k, v := range s.staticWeightMap { | for k, v := range s.staticWeightMap { | ||||
| weight := &weightDistributing.Weight{ | weight := &weightDistributing.Weight{ | ||||
| Name: k, | |||||
| Id: k, | |||||
| Weight: v, | Weight: v, | ||||
| } | } | ||||
| weights = append(weights, weight) | weights = append(weights, weight) | ||||
| @@ -39,7 +39,7 @@ func (s *StaticWeightStrategy) Schedule() ([]*AssignedCluster, error) { | |||||
| var results []*AssignedCluster | var results []*AssignedCluster | ||||
| for _, weight := range weights { | for _, weight := range weights { | ||||
| cluster := &AssignedCluster{ParticipantId: weight.Id, Name: weight.Name, Replicas: weight.Replica} | |||||
| cluster := &AssignedCluster{ClusterId: weight.Id, Replicas: weight.Replica} | |||||
| results = append(results, cluster) | results = append(results, cluster) | ||||
| } | } | ||||
| @@ -18,9 +18,8 @@ type Strategy interface { | |||||
| } | } | ||||
| type AssignedCluster struct { | type AssignedCluster struct { | ||||
| ParticipantId int64 | |||||
| Name string | |||||
| Replicas int32 | |||||
| ClusterId string | |||||
| Replicas int32 | |||||
| } | } | ||||
| func GetStrategyNames() []string { | func GetStrategyNames() []string { | ||||
| @@ -5,7 +5,6 @@ import ( | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy" | "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param" | |||||
| "testing" | "testing" | ||||
| ) | ) | ||||
| @@ -17,15 +16,15 @@ func TestReplication(t *testing.T) { | |||||
| } | } | ||||
| rsc := []*collector.ResourceStats{ | rsc := []*collector.ResourceStats{ | ||||
| { | { | ||||
| ParticipantId: 1, | |||||
| Name: "test1", | |||||
| ClusterId: "1", | |||||
| Name: "test1", | |||||
| }, | }, | ||||
| { | { | ||||
| ParticipantId: 1, | |||||
| Name: "test2"}, | |||||
| ClusterId: "2", | |||||
| Name: "test2"}, | |||||
| { | { | ||||
| ParticipantId: 1, | |||||
| Name: "test3"}, | |||||
| ClusterId: "3", | |||||
| Name: "test3"}, | |||||
| } | } | ||||
| tests := []struct { | tests := []struct { | ||||
| name string | name string | ||||
| @@ -47,8 +46,11 @@ func TestReplication(t *testing.T) { | |||||
| for _, tt := range tests { | for _, tt := range tests { | ||||
| t.Run(tt.name, func(t *testing.T) { | t.Run(tt.name, func(t *testing.T) { | ||||
| params := ¶m.Params{Resources: rsc} | |||||
| repl := strategy.NewReplicationStrategy(¶m.ReplicationParams{Params: params, Replicas: tt.replica}) | |||||
| var clusterIds []string | |||||
| for _, stats := range rsc { | |||||
| clusterIds = append(clusterIds, stats.ClusterId) | |||||
| } | |||||
| repl := strategy.NewReplicationStrategy(clusterIds, 0) | |||||
| schedule, err := repl.Schedule() | schedule, err := repl.Schedule() | ||||
| if err != nil { | if err != nil { | ||||
| return | return | ||||
| @@ -283,11 +283,11 @@ func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.Resource | |||||
| } | } | ||||
| resourceStats := &collector.ResourceStats{ | resourceStats := &collector.ResourceStats{ | ||||
| ParticipantId: o.participantId, | |||||
| Name: o.platform, | |||||
| Balance: balance, | |||||
| CardsAvail: cards, | |||||
| CpuCoreHours: cpuHours, | |||||
| ClusterId: strconv.FormatInt(o.participantId, 10), | |||||
| Name: o.platform, | |||||
| Balance: balance, | |||||
| CardsAvail: cards, | |||||
| CpuCoreHours: cpuHours, | |||||
| } | } | ||||
| return resourceStats, nil | return resourceStats, nil | ||||
| @@ -26,6 +26,8 @@ import ( | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | ||||
| "strconv" | "strconv" | ||||
| "strings" | "strings" | ||||
| "sync" | |||||
| "time" | |||||
| ) | ) | ||||
| const ( | const ( | ||||
| @@ -266,96 +268,144 @@ func (s *ShuguangAi) QuerySpecs(ctx context.Context) (interface{}, error) { | |||||
| } | } | ||||
| func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) { | func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) { | ||||
| //balance | |||||
| userReq := &hpcAC.GetUserInfoReq{} | |||||
| userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| var wg sync.WaitGroup | |||||
| wg.Add(4) | |||||
| var cBalance = make(chan float64) | |||||
| var cMemTotal = make(chan float64) | |||||
| var cTotalCpu = make(chan int64) | |||||
| resourceStats := &collector.ResourceStats{ | |||||
| ClusterId: strconv.FormatInt(s.participantId, 10), | |||||
| Name: s.platform, | |||||
| } | } | ||||
| balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64) | |||||
| //resource limit | |||||
| limitReq := &hpcAC.QueueReq{} | |||||
| limitResp, err := s.aCRpc.QueryUserQuotasLimit(ctx, limitReq) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| dcu := &collector.Card{ | |||||
| Platform: SHUGUANGAI, | |||||
| Type: CARD, | |||||
| Name: DCU, | |||||
| TOpsAtFp16: DCU_TOPS, | |||||
| } | } | ||||
| totalCpu := limitResp.Data.AccountMaxCpu | |||||
| totalDcu := limitResp.Data.AccountMaxDcu | |||||
| //balance | |||||
| go func() { | |||||
| userReq := &hpcAC.GetUserInfoReq{} | |||||
| userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq) | |||||
| if err != nil { | |||||
| return | |||||
| } | |||||
| balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64) | |||||
| resourceStats.Balance = balance | |||||
| cBalance <- balance | |||||
| }() | |||||
| //resource limit | |||||
| go func() { | |||||
| limitReq := &hpcAC.QueueReq{} | |||||
| limitResp, err := s.aCRpc.QueryUserQuotasLimit(ctx, limitReq) | |||||
| if err != nil { | |||||
| wg.Done() | |||||
| return | |||||
| } | |||||
| totalCpu := limitResp.Data.AccountMaxCpu | |||||
| totalDcu := limitResp.Data.AccountMaxDcu | |||||
| dcu.CardNum = int32(totalDcu) | |||||
| resourceStats.CpuCoreTotal = totalCpu | |||||
| cTotalCpu <- totalCpu | |||||
| wg.Done() | |||||
| }() | |||||
| //disk | //disk | ||||
| //diskReq := &hpcAC.ParaStorQuotaReq{} | |||||
| //diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq) | |||||
| //if err != nil { | |||||
| // return nil, err | |||||
| //} | |||||
| // | |||||
| //totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB*KB, 3) | |||||
| //availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB*KB, 3) | |||||
| go func() { | |||||
| diskReq := &hpcAC.ParaStorQuotaReq{} | |||||
| diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq) | |||||
| if err != nil { | |||||
| wg.Done() | |||||
| return | |||||
| } | |||||
| totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB*KB, 3) | |||||
| availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB*KB, 3) | |||||
| resourceStats.DiskTotal = totalDisk | |||||
| resourceStats.DiskAvail = availDisk | |||||
| wg.Done() | |||||
| }() | |||||
| //memory | //memory | ||||
| nodeResp, err := s.aCRpc.GetNodeResources(ctx, nil) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| memSize := common.RoundFloat(float64(nodeResp.Data.MemorySize)*KB*KB, 3) // MB to BYTES | |||||
| go func() { | |||||
| nodeResp, err := s.aCRpc.GetNodeResources(ctx, nil) | |||||
| if err != nil { | |||||
| wg.Done() | |||||
| return | |||||
| } | |||||
| memSize := common.RoundFloat(float64(nodeResp.Data.MemorySize)*KB*KB, 3) // MB to BYTES | |||||
| resourceStats.MemTotal = memSize | |||||
| cMemTotal <- memSize | |||||
| wg.Done() | |||||
| }() | |||||
| //resources being occupied | //resources being occupied | ||||
| memberJobResp, err := s.aCRpc.GetMemberJobs(ctx, nil) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| var CpuCoreAvail int64 | |||||
| var MemAvail float64 | |||||
| if len(memberJobResp.Data) != 0 { | |||||
| CpuCoreAvail = totalCpu | |||||
| MemAvail = memSize | |||||
| } else { | |||||
| var cpuCoreUsed int64 | |||||
| var memUsed float64 | |||||
| for _, datum := range memberJobResp.Data { | |||||
| cpuCoreUsed += datum.CpuCore | |||||
| } | |||||
| memUsed = float64(cpuCoreUsed * 2 * KB * KB * KB) // 2 GB per cpu core | |||||
| if cpuCoreUsed > totalCpu { | |||||
| CpuCoreAvail = 0 | |||||
| } else { | |||||
| CpuCoreAvail = totalCpu - cpuCoreUsed | |||||
| go func() { | |||||
| memSize := <-cMemTotal | |||||
| totalCpu := <-cTotalCpu | |||||
| memberJobResp, err := s.aCRpc.GetMemberJobs(ctx, nil) | |||||
| if err != nil { | |||||
| wg.Done() | |||||
| return | |||||
| } | } | ||||
| if memUsed > memSize { | |||||
| MemAvail = 0 | |||||
| var cpuCoreAvail int64 | |||||
| var memAvail float64 | |||||
| if len(memberJobResp.Data) != 0 { | |||||
| cpuCoreAvail = totalCpu | |||||
| memAvail = memSize | |||||
| } else { | } else { | ||||
| MemAvail = memSize - memUsed | |||||
| var cpuCoreUsed int64 | |||||
| var memUsed float64 | |||||
| for _, datum := range memberJobResp.Data { | |||||
| cpuCoreUsed += datum.CpuCore | |||||
| } | |||||
| memUsed = float64(cpuCoreUsed * 2 * KB * KB * KB) // 2 GB per cpu core | |||||
| if cpuCoreUsed > totalCpu { | |||||
| cpuCoreAvail = 0 | |||||
| } else { | |||||
| cpuCoreAvail = totalCpu - cpuCoreUsed | |||||
| } | |||||
| if memUsed > memSize { | |||||
| memAvail = 0 | |||||
| } else { | |||||
| memAvail = memSize - memUsed | |||||
| } | |||||
| } | } | ||||
| } | |||||
| resourceStats.CpuCoreAvail = cpuCoreAvail | |||||
| resourceStats.MemAvail = memAvail | |||||
| wg.Done() | |||||
| }() | |||||
| //usable hours | //usable hours | ||||
| var balance float64 | |||||
| select { | |||||
| case v := <-cBalance: | |||||
| balance = v | |||||
| case <-time.After(2 * time.Second): | |||||
| return nil, errors.New("get balance rpc call failed") | |||||
| } | |||||
| var cards []*collector.Card | var cards []*collector.Card | ||||
| cardHours := common.RoundFloat(balance/DCUPRICEPERHOUR, 3) | cardHours := common.RoundFloat(balance/DCUPRICEPERHOUR, 3) | ||||
| cpuHours := common.RoundFloat(balance/CPUCOREPRICEPERHOUR, 3) | cpuHours := common.RoundFloat(balance/CPUCOREPRICEPERHOUR, 3) | ||||
| dcu := &collector.Card{ | |||||
| Platform: SHUGUANGAI, | |||||
| Type: CARD, | |||||
| Name: DCU, | |||||
| TOpsAtFp16: DCU_TOPS, | |||||
| CardHours: cardHours, | |||||
| CardNum: int32(totalDcu), | |||||
| } | |||||
| dcu.CardHours = cardHours | |||||
| resourceStats.CpuCoreHours = cpuHours | |||||
| wg.Wait() | |||||
| cards = append(cards, dcu) | cards = append(cards, dcu) | ||||
| resourceStats := &collector.ResourceStats{ | |||||
| ParticipantId: s.participantId, | |||||
| Name: s.platform, | |||||
| Balance: balance, | |||||
| CpuCoreTotal: totalCpu, | |||||
| CpuCoreAvail: CpuCoreAvail, | |||||
| //DiskTotal: totalDisk, | |||||
| //DiskAvail: availDisk, | |||||
| MemTotal: memSize, | |||||
| MemAvail: MemAvail, | |||||
| CpuCoreHours: cpuHours, | |||||
| CardsAvail: cards, | |||||
| } | |||||
| resourceStats.CardsAvail = cards | |||||
| return resourceStats, nil | return resourceStats, nil | ||||
| } | } | ||||
| @@ -16,6 +16,7 @@ package storeLink | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| "fmt" | |||||
| "github.com/pkg/errors" | "github.com/pkg/errors" | ||||
| "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" | "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" | ||||
| "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" | "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" | ||||
| @@ -127,21 +128,29 @@ func GetResourceTypes() []string { | |||||
| return resourceTypes | return resourceTypes | ||||
| } | } | ||||
| func GetDatasetsNames(ctx context.Context, collectorMap *map[string]collector.AiCollector) ([]string, error) { | |||||
| func GetDatasetsNames(ctx context.Context, collectorMap map[string]collector.AiCollector) ([]string, error) { | |||||
| var wg sync.WaitGroup | var wg sync.WaitGroup | ||||
| var errCh = make(chan error, len(*collectorMap)) | |||||
| var errs []error | |||||
| var errCh = make(chan interface{}, len(collectorMap)) | |||||
| var errs []interface{} | |||||
| var names []string | var names []string | ||||
| var mu sync.Mutex | var mu sync.Mutex | ||||
| colMap := *collectorMap | |||||
| for _, col := range colMap { | |||||
| colMap := collectorMap | |||||
| for s, col := range colMap { | |||||
| wg.Add(1) | wg.Add(1) | ||||
| c := col | c := col | ||||
| id := s | |||||
| go func() { | go func() { | ||||
| var ns []string | var ns []string | ||||
| specs, err := c.GetDatasetsSpecs(ctx) | specs, err := c.GetDatasetsSpecs(ctx) | ||||
| if err != nil { | if err != nil { | ||||
| errCh <- err | |||||
| e := struct { | |||||
| err error | |||||
| clusterId string | |||||
| }{ | |||||
| err: err, | |||||
| clusterId: id, | |||||
| } | |||||
| errCh <- e | |||||
| wg.Done() | wg.Done() | ||||
| return | return | ||||
| } | } | ||||
| @@ -167,34 +176,54 @@ func GetDatasetsNames(ctx context.Context, collectorMap *map[string]collector.Ai | |||||
| wg.Wait() | wg.Wait() | ||||
| close(errCh) | close(errCh) | ||||
| if len(errs) == len(colMap) { | |||||
| return nil, errors.New("get DatasetsNames failed") | |||||
| } | |||||
| for e := range errCh { | for e := range errCh { | ||||
| errs = append(errs, e) | errs = append(errs, e) | ||||
| } | } | ||||
| if len(errs) != 0 { | if len(errs) != 0 { | ||||
| return nil, errors.New("get DatasetsNames failed") | |||||
| var msg string | |||||
| for _, err := range errs { | |||||
| e := (err).(struct { | |||||
| err error | |||||
| clusterId string | |||||
| }) | |||||
| msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error()) | |||||
| } | |||||
| return nil, errors.New(msg) | |||||
| } | } | ||||
| names = common.RemoveDuplicates(names) | names = common.RemoveDuplicates(names) | ||||
| return names, nil | return names, nil | ||||
| } | } | ||||
| func GetAlgorithms(ctx context.Context, collectorMap *map[string]collector.AiCollector, resourceType string, taskType string, dataset string) ([]string, error) { | |||||
| func GetAlgorithms(ctx context.Context, collectorMap map[string]collector.AiCollector, resourceType string, taskType string, dataset string) ([]string, error) { | |||||
| var names []string | var names []string | ||||
| var wg sync.WaitGroup | var wg sync.WaitGroup | ||||
| var errCh = make(chan error, len(*collectorMap)) | |||||
| var errs []error | |||||
| var errCh = make(chan interface{}, len(collectorMap)) | |||||
| var errs []interface{} | |||||
| var mu sync.Mutex | var mu sync.Mutex | ||||
| colMap := *collectorMap | |||||
| for _, col := range colMap { | |||||
| colMap := collectorMap | |||||
| for s, col := range colMap { | |||||
| wg.Add(1) | wg.Add(1) | ||||
| c := col | c := col | ||||
| id := s | |||||
| go func() { | go func() { | ||||
| var ns []string | var ns []string | ||||
| algorithms, err := c.GetAlgorithms(ctx) | algorithms, err := c.GetAlgorithms(ctx) | ||||
| if err != nil { | if err != nil { | ||||
| errCh <- err | |||||
| e := struct { | |||||
| err error | |||||
| clusterId string | |||||
| }{ | |||||
| err: err, | |||||
| clusterId: id, | |||||
| } | |||||
| errCh <- e | |||||
| wg.Done() | wg.Done() | ||||
| return | return | ||||
| } | } | ||||
| @@ -240,10 +269,22 @@ func GetAlgorithms(ctx context.Context, collectorMap *map[string]collector.AiCol | |||||
| errs = append(errs, e) | errs = append(errs, e) | ||||
| } | } | ||||
| if len(errs) != 0 { | |||||
| if len(errs) == len(colMap) { | |||||
| return nil, errors.New("get Algorithms failed") | return nil, errors.New("get Algorithms failed") | ||||
| } | } | ||||
| if len(errs) != 0 { | |||||
| var msg string | |||||
| for _, err := range errs { | |||||
| e := (err).(struct { | |||||
| err error | |||||
| clusterId string | |||||
| }) | |||||
| msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error()) | |||||
| } | |||||
| return nil, errors.New(msg) | |||||
| } | |||||
| names = common.RemoveDuplicates(names) | names = common.RemoveDuplicates(names) | ||||
| return names, nil | return names, nil | ||||
| } | } | ||||
| @@ -116,24 +116,28 @@ func NewServiceContext(c config.Config) *ServiceContext { | |||||
| }) | }) | ||||
| // scheduler | // scheduler | ||||
| octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf)) | |||||
| aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf)) | |||||
| modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf)) | |||||
| modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf)) | |||||
| //octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf)) | |||||
| //aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf)) | |||||
| //modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf)) | |||||
| //modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf)) | |||||
| storage := &database.AiStorage{DbEngin: dbEngin} | storage := &database.AiStorage{DbEngin: dbEngin} | ||||
| aiExecutor, resourceCollector := service.InitAiClusterMap(octopusRpc, modelArtsRpc, modelArtsImgRpc, aCRpc, storage) | |||||
| scheduler := scheduler.NewSchdlr(resourceCollector, storage, aiExecutor) | |||||
| aiService, err := service.NewAiService(&c, storage) | |||||
| if err != nil { | |||||
| logx.Error(err.Error()) | |||||
| return nil | |||||
| } | |||||
| scheduler := scheduler.NewSchdlr(aiService, storage) | |||||
| return &ServiceContext{ | return &ServiceContext{ | ||||
| Cron: cron.New(cron.WithSeconds()), | Cron: cron.New(cron.WithSeconds()), | ||||
| DbEngin: dbEngin, | DbEngin: dbEngin, | ||||
| Config: c, | Config: c, | ||||
| RedisClient: redisClient, | RedisClient: redisClient, | ||||
| ModelArtsRpc: modelArtsRpc, | |||||
| ModelArtsImgRpc: modelArtsImgRpc, | |||||
| ModelArtsRpc: modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf)), | |||||
| ModelArtsImgRpc: imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf)), | |||||
| CephRpc: cephclient.NewCeph(zrpc.MustNewClient(c.CephRpcConf)), | CephRpc: cephclient.NewCeph(zrpc.MustNewClient(c.CephRpcConf)), | ||||
| ACRpc: aCRpc, | |||||
| OctopusRpc: octopusRpc, | |||||
| ACRpc: hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf)), | |||||
| OctopusRpc: octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf)), | |||||
| OpenstackRpc: openstackclient.NewOpenstack(zrpc.MustNewClient(c.OpenstackRpcConf)), | OpenstackRpc: openstackclient.NewOpenstack(zrpc.MustNewClient(c.OpenstackRpcConf)), | ||||
| K8sRpc: kubernetesclient.NewKubernetes(zrpc.MustNewClient(c.K8sNativeConf)), | K8sRpc: kubernetesclient.NewKubernetes(zrpc.MustNewClient(c.K8sNativeConf)), | ||||
| MonitorClient: make(map[int64]tracker.Prometheus), | MonitorClient: make(map[int64]tracker.Prometheus), | ||||
| @@ -131,40 +131,22 @@ type TaskYaml struct { | |||||
| } | } | ||||
| type CommitVmTaskReq struct { | type CommitVmTaskReq struct { | ||||
| Name string `json:"name"` | |||||
| NsID string `json:"nsID"` | |||||
| Replicas int64 `json:"replicas,optional"` | |||||
| MatchLabels map[string]string `json:"matchLabels,optional"` | |||||
| Servers []ServerCommit `json:"servers,optional"` | |||||
| Platform string `json:"platform,optional"` | |||||
| AdapterId string `json:"adapterId,optional"` | |||||
| ClusterType string `json:"clusterType,optional"` | |||||
| } | |||||
| type ServerCommit struct { | |||||
| AllCardRunTime string `json:"allCardRunTime"` | |||||
| FlavorRef string `json:"flavorRef,optional"` | |||||
| Name string `json:"name,optional"` | |||||
| ImageRef string `json:"imageRef,optional"` | |||||
| AccessIPv4 string `json:"accessIPv4,optional"` | |||||
| AccessIPv6 string `json:"accessIPv6,optional"` | |||||
| AdminPass string `json:"adminPass,optional"` | |||||
| Availability_zone string `json:"availability_zone,optional"` | |||||
| Key_name string `json:"key_name,optional"` | |||||
| Hostname string `json:"hostname,optional"` | |||||
| Host string `json:"host,optional"` | |||||
| Networks []Networks `json:"networks,optional"` | |||||
| } | |||||
| type Networks struct { | |||||
| Uuid string `json:"uuid,optional"` | |||||
| Port string `json:"port,optional"` | |||||
| Fixed_ip string `json:"fixed_ip,optional"` | |||||
| Tag string `json:"tag,optional"` | |||||
| Name string `json:"name"` | |||||
| NsID string `json:"nsID"` | |||||
| Replicas int64 `json:"replicas,optional"` | |||||
| MatchLabels map[string]string `json:"matchLabels,optional"` | |||||
| AdapterId string `json:"adapterId,optional"` | |||||
| ClusterType string `json:"clusterType,optional"` | |||||
| CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"` | |||||
| } | } | ||||
| type Block_device_mapping_v2Commit struct { | |||||
| Uuid string `json:"uuid,optional"` | |||||
| type CreateMulDomainServer struct { | |||||
| Platform string `json:"platform,optional"` | |||||
| Name string `json:"name,optional"` | |||||
| Min_count int64 `json:"min_count,optional"` | |||||
| ImageRef string `json:"imageRef,optional"` | |||||
| FlavorRef string `json:"flavorRef,optional"` | |||||
| Uuid string `json:"uuid,optional"` | |||||
| } | } | ||||
| type CommitVmTaskResp struct { | type CommitVmTaskResp struct { | ||||
| @@ -5309,13 +5291,15 @@ type ScheduleResp struct { | |||||
| type ScheduleResult struct { | type ScheduleResult struct { | ||||
| ClusterId string `json:"clusterId"` | ClusterId string `json:"clusterId"` | ||||
| TaskId string `json:"taskId"` | TaskId string `json:"taskId"` | ||||
| Strategy string `json:"strategy"` | |||||
| Replica int32 `json:"replica"` | Replica int32 `json:"replica"` | ||||
| Msg string `json:"msg"` | Msg string `json:"msg"` | ||||
| } | } | ||||
| type AiOption struct { | type AiOption struct { | ||||
| TaskName string `json:"taskName"` | TaskName string `json:"taskName"` | ||||
| AiClusterId string `json:"aiClusterId,optional"` | |||||
| AdapterId string `json:"adapterId"` | |||||
| AiClusterIds []string `json:"aiClusterIds"` | |||||
| ResourceType string `json:"resourceType"` | ResourceType string `json:"resourceType"` | ||||
| Tops float64 `json:"Tops,optional"` | Tops float64 `json:"Tops,optional"` | ||||
| TaskType string `json:"taskType"` | TaskType string `json:"taskType"` | ||||
| @@ -5336,6 +5320,10 @@ type AiTaskTypesResp struct { | |||||
| TaskTypes []string `json:"taskTypes"` | TaskTypes []string `json:"taskTypes"` | ||||
| } | } | ||||
| type AiDatasetsReq struct { | |||||
| AdapterId string `path:"adapterId"` | |||||
| } | |||||
| type AiDatasetsResp struct { | type AiDatasetsResp struct { | ||||
| Datasets []string `json:"datasets"` | Datasets []string `json:"datasets"` | ||||
| } | } | ||||
| @@ -5345,6 +5333,7 @@ type AiStrategyResp struct { | |||||
| } | } | ||||
| type AiAlgorithmsReq struct { | type AiAlgorithmsReq struct { | ||||
| AdapterId string `path:"adapterId"` | |||||
| ResourceType string `path:"resourceType"` | ResourceType string `path:"resourceType"` | ||||
| TaskType string `path:"taskType"` | TaskType string `path:"taskType"` | ||||
| Dataset string `path:"dataset"` | Dataset string `path:"dataset"` | ||||
| @@ -5451,7 +5440,10 @@ type VmInfo struct { | |||||
| BlockUuid string `json:"block_uuid,omitempty"` | BlockUuid string `json:"block_uuid,omitempty"` | ||||
| SourceType string `json:"source_type,omitempty"` | SourceType string `json:"source_type,omitempty"` | ||||
| DeleteOnTermination bool `json:"delete_on_termination,omitempty"` | DeleteOnTermination bool `json:"delete_on_termination,omitempty"` | ||||
| State string `json:"state,omitempty"` | |||||
| Status string `json:"status,omitempty"` | |||||
| MinCount string `json:"min_count,omitempty"` | |||||
| Platform string `json:"platform,omitempty"` | |||||
| Uuid string `json:"uuid,omitempty"` | |||||
| } | } | ||||
| type PushTaskInfoReq struct { | type PushTaskInfoReq struct { | ||||
| @@ -5468,7 +5460,37 @@ type PushTaskInfoResp struct { | |||||
| } | } | ||||
| type PushResourceInfoReq struct { | type PushResourceInfoReq struct { | ||||
| AdapterId int64 `json:"adapterId"` | |||||
| AdapterId int64 `json:"adapterId"` | |||||
| ResourceStats []ResourceStats `json:"resourceStats"` | |||||
| } | |||||
| type PushResourceInfoResp struct { | |||||
| Code int64 `json:"code"` | |||||
| Msg string `json:"msg"` | |||||
| } | |||||
| type ResourceStats struct { | |||||
| ClusterId int64 `json:"clusterId"` | |||||
| Name string `json:"name"` | |||||
| CpuCoreAvail int64 `json:"cpuCoreAvail"` | |||||
| CpuCoreTotal int64 `json:"cpuCoreTotal"` | |||||
| MemAvail float64 `json:"memAvail"` | |||||
| MemTotal float64 `json:"memTotal"` | |||||
| DiskAvail float64 `json:"diskAvail"` | |||||
| DiskTotal float64 `json:"diskTotal"` | |||||
| GpuAvail int64 `json:"gpuAvail"` | |||||
| CardsAvail []*Card `json:"cardsAvail"` | |||||
| CpuCoreHours float64 `json:"cpuCoreHours"` | |||||
| Balance float64 `json:"balance"` | |||||
| } | |||||
| type Card struct { | |||||
| Platform string `json:"platform"` | |||||
| Type string `json:"type"` | |||||
| Name string `json:"name"` | |||||
| TOpsAtFp16 float64 `json:"TOpsAtFp16"` | |||||
| CardHours float64 `json:"cardHours"` | |||||
| CardNum int32 `json:"cardNum"` | |||||
| } | } | ||||
| type CreateAlertRuleReq struct { | type CreateAlertRuleReq struct { | ||||
| @@ -2,6 +2,8 @@ module gitlink.org.cn/JointCloud/pcm-coordinator | |||||
| go 1.21 | go 1.21 | ||||
| retract v0.1.20-0.20240319015239-6ae13da05255 | |||||
| require ( | require ( | ||||
| github.com/JCCE-nudt/zero-contrib/zrpc/registry/nacos v0.0.0-20230419021610-13bbc83fbc3c | github.com/JCCE-nudt/zero-contrib/zrpc/registry/nacos v0.0.0-20230419021610-13bbc83fbc3c | ||||
| github.com/Masterminds/squirrel v1.5.4 | github.com/Masterminds/squirrel v1.5.4 | ||||
| @@ -0,0 +1,24 @@ | |||||
| package models | |||||
| import "github.com/zeromicro/go-zero/core/stores/sqlx" | |||||
| var _ TaskVmModel = (*customTaskVmModel)(nil) | |||||
| type ( | |||||
| // TaskVmModel is an interface to be customized, add more methods here, | |||||
| // and implement the added methods in customTaskVmModel. | |||||
| TaskVmModel interface { | |||||
| taskVmModel | |||||
| } | |||||
| customTaskVmModel struct { | |||||
| *defaultTaskVmModel | |||||
| } | |||||
| ) | |||||
| // NewTaskVmModel returns a model for the database table. | |||||
| func NewTaskVmModel(conn sqlx.SqlConn) TaskVmModel { | |||||
| return &customTaskVmModel{ | |||||
| defaultTaskVmModel: newTaskVmModel(conn), | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,107 @@ | |||||
| // Code generated by goctl. DO NOT EDIT. | |||||
| package models | |||||
| import ( | |||||
| "context" | |||||
| "database/sql" | |||||
| "fmt" | |||||
| "strings" | |||||
| "github.com/zeromicro/go-zero/core/stores/builder" | |||||
| "github.com/zeromicro/go-zero/core/stores/sqlc" | |||||
| "github.com/zeromicro/go-zero/core/stores/sqlx" | |||||
| "github.com/zeromicro/go-zero/core/stringx" | |||||
| ) | |||||
| var ( | |||||
| taskVmFieldNames = builder.RawFieldNames(&TaskVm{}) | |||||
| taskVmRows = strings.Join(taskVmFieldNames, ",") | |||||
| taskVmRowsExpectAutoSet = strings.Join(stringx.Remove(taskVmFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",") | |||||
| taskVmRowsWithPlaceHolder = strings.Join(stringx.Remove(taskVmFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?" | |||||
| ) | |||||
type (
	// taskVmModel defines the generated CRUD operations for the task_vm table.
	taskVmModel interface {
		Insert(ctx context.Context, data *TaskVm) (sql.Result, error)
		FindOne(ctx context.Context, id int64) (*TaskVm, error)
		Update(ctx context.Context, data *TaskVm) error
		Delete(ctx context.Context, id int64) error
	}

	// defaultTaskVmModel is the sqlx-backed implementation of taskVmModel.
	defaultTaskVmModel struct {
		conn  sqlx.SqlConn
		table string // backtick-quoted table name, e.g. "`task_vm`"
	}

	// TaskVm maps one row of the task_vm table.
	TaskVm struct {
		Id               int64  `db:"id"`                // primary key
		ParticipantId    int64  `db:"participant_id"`    // participant-side (P-side) id
		TaskId           int64  `db:"task_id"`           // task id
		Name             string `db:"name"`              // virtual machine name
		AdapterId        int64  `db:"adapter_id"`        // id of the adapter executing the task
		ClusterId        int64  `db:"cluster_id"`        // id of the cluster executing the task
		FlavorRef        string `db:"flavor_ref"`        // flavor (spec) reference
		ImageRef         string `db:"image_ref"`         // image reference
		Status           string `db:"status"`            // status
		Platform         string `db:"platform"`          // platform
		Description      string `db:"description"`      // description
		AvailabilityZone string `db:"availability_zone"`
		MinCount         int64  `db:"min_count"`         // instance count
		Uuid             string `db:"uuid"`              // network id (per original comment; field name says uuid — verify against writers)
		StartTime        string `db:"start_time"`        // start time
		RunningTime      string `db:"running_time"`      // running duration
		Result           string `db:"result"`            // run result
		DeletedAt        string `db:"deleted_at"`        // deletion time
	}
)
| func newTaskVmModel(conn sqlx.SqlConn) *defaultTaskVmModel { | |||||
| return &defaultTaskVmModel{ | |||||
| conn: conn, | |||||
| table: "`task_vm`", | |||||
| } | |||||
| } | |||||
| func (m *defaultTaskVmModel) withSession(session sqlx.Session) *defaultTaskVmModel { | |||||
| return &defaultTaskVmModel{ | |||||
| conn: sqlx.NewSqlConnFromSession(session), | |||||
| table: "`task_vm`", | |||||
| } | |||||
| } | |||||
| func (m *defaultTaskVmModel) Delete(ctx context.Context, id int64) error { | |||||
| query := fmt.Sprintf("delete from %s where `id` = ?", m.table) | |||||
| _, err := m.conn.ExecCtx(ctx, query, id) | |||||
| return err | |||||
| } | |||||
| func (m *defaultTaskVmModel) FindOne(ctx context.Context, id int64) (*TaskVm, error) { | |||||
| query := fmt.Sprintf("select %s from %s where `id` = ? limit 1", taskVmRows, m.table) | |||||
| var resp TaskVm | |||||
| err := m.conn.QueryRowCtx(ctx, &resp, query, id) | |||||
| switch err { | |||||
| case nil: | |||||
| return &resp, nil | |||||
| case sqlc.ErrNotFound: | |||||
| return nil, ErrNotFound | |||||
| default: | |||||
| return nil, err | |||||
| } | |||||
| } | |||||
| func (m *defaultTaskVmModel) Insert(ctx context.Context, data *TaskVm) (sql.Result, error) { | |||||
| query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) | |||||
| ret, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) | |||||
| return ret, err | |||||
| } | |||||
| func (m *defaultTaskVmModel) Update(ctx context.Context, data *TaskVm) error { | |||||
| query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskVmRowsWithPlaceHolder) | |||||
| _, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) | |||||
| return err | |||||
| } | |||||
| func (m *defaultTaskVmModel) tableName() string { | |||||
| return m.table | |||||
| } | |||||