diff --git a/desc/hpc/pcm-hpc.api b/desc/hpc/pcm-hpc.api index 92752fa3..6ffade88 100644 --- a/desc/hpc/pcm-hpc.api +++ b/desc/hpc/pcm-hpc.api @@ -205,22 +205,17 @@ type ( CreateTime string `json:"create_time"` UpdateTime string `json:"update_time"` } -// App string `json:"app"` -//Backend string `json:"backend" binding:"required,oneof=slurm sugonac"` // 后端类型:slurm/sugonac -//Partition string // 分区/队列名称 -//TaskId string `json:"taskId"` -//ClusterId string `json:"clusterId"` -//JobName string `json:"jobName"` -//ScriptContent string `json:"scriptContent"` -//ScriptDir string `json:"scriptDir"` -//Parameters map[string]string `json:"parameters"` -//TimeLimit time.Duration // 作业时间限制 + SubmitHpcTaskReq { App string `json:"app"` ClusterId string `json:"clusterId"` JobName string `json:"jobName"` ScriptContent string `json:"scriptContent"` Parameters map[string]string `json:"parameters"` - Backend string `json:"backend"` + Backend string `json:"backend"` } -) \ No newline at end of file +) + +type HpcAppClusterReq { + App string `form:"app"` +} \ No newline at end of file diff --git a/desc/pcm.api b/desc/pcm.api index 48bac639..3ea54c12 100644 --- a/desc/pcm.api +++ b/desc/pcm.api @@ -186,7 +186,7 @@ service pcm { @doc "同步指定资源规格" @handler syncResourceSpecHandler - put /core/ai/resourceSpec/sync (SyncResourceReq) returns (CommonResp) + put /core/ai/resourceSpec/sync (SyncResourceReq) returns (ListResult) @doc "获取指定资源规格详情" @handler detailResourceSpecHandler @@ -251,6 +251,10 @@ service pcm { @doc "超算任务日志" @handler getHpcTaskLogHandler get /hpc/jobLogs/:taskId (HpcTaskLogReq) returns (HpcTaskLogResp) + + @doc "查询超算应用支持的集群" + @handler getHpcAppClusterHandler + get /hpc/getHpcAppCluster (HpcAppClusterReq) returns (CommonResp) } //cloud二级接口 diff --git a/internal/handler/hpc/gethpcappclusterhandler.go b/internal/handler/hpc/gethpcappclusterhandler.go new file mode 100644 index 00000000..e85e5deb --- /dev/null +++ b/internal/handler/hpc/gethpcappclusterhandler.go @@ -0,0 +1,24 @@ +package hpc + +import ( + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/hpc" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" + "net/http" +) + +func GetHpcAppClusterHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.HpcAppClusterReq + if err := httpx.Parse(r, &req); err != nil { + result.ParamErrorResult(r, w, err) + return + } + + l := hpc.NewGetHpcAppClusterLogic(r.Context(), svcCtx) + resp, err := l.GetHpcAppCluster(&req) + result.HttpResult(r, w, resp, err) + } +} diff --git a/internal/handler/routes.go b/internal/handler/routes.go index b32ee029..cd11309a 100644 --- a/internal/handler/routes.go +++ b/internal/handler/routes.go @@ -724,6 +724,12 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/hpc/commitHpcTask", Handler: hpc.CommitHpcTaskHandler(serverCtx), }, + { + // 查询超算应用支持的集群 + Method: http.MethodGet, + Path: "/hpc/getHpcAppCluster", + Handler: hpc.GetHpcAppClusterHandler(serverCtx), + }, { // 超算查询任务列表 Method: http.MethodGet, diff --git a/internal/logic/hpc/commithpctasklogic.go b/internal/logic/hpc/commithpctasklogic.go index 98a1ec2e..e5e02b73 100644 --- a/internal/logic/hpc/commithpctasklogic.go +++ b/internal/logic/hpc/commithpctasklogic.go @@ -13,6 +13,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" + "regexp" "strconv" "strings" "sync" @@ -85,13 +86,67 @@ func (l *CommitHpcTaskLogic) getClusterInfo(clusterID string) (*types.ClusterInf return &clusterInfo, &adapterInfo, nil } +// 自定义函数映射 +func createFuncMap() template.FuncMap { + return template.FuncMap{ + "regexMatch": regexMatch, + "required": required, + "error": errorHandler, + "default": defaultHandler, + } +} +func extractUserError(originalErr error) error { + // 尝试匹配模板引擎返回的错误格式 + re := regexp.MustCompile(`error calling \w+: (.*)$`) + matches := re.FindStringSubmatch(originalErr.Error()) + if len(matches) > 1 { + return errors.New(matches[1]) + } + return originalErr +} + +// 正则匹配函数 +func regexMatch(pattern string) *regexp.Regexp { + return regexp.MustCompile(pattern) +} + +// 必填字段检查 +func required(msg string, val interface{}) (interface{}, error) { + if val == nil || val == "" { + return nil, errors.New(msg) + } + return val, nil +} + +// 错误处理函数 +func errorHandler(msg string) (string, error) { + return "", errors.New(msg) +} + +// 默认值处理函数 +func defaultHandler(defaultVal interface{}, val interface{}) interface{} { + switch v := val.(type) { + case nil: + return defaultVal + case string: + if v == "" { + return defaultVal + } + case int: + if v == 0 { + return defaultVal + } + // 可根据需要添加其他类型判断 + } + return val +} func (l *CommitHpcTaskLogic) RenderJobScript(templateContent string, req *JobRequest) (string, error) { // 使用缓存模板 tmpl, ok := templateCache.Load(templateContent) if !ok { - parsedTmpl, err := template.New("jobScript").Parse(templateContent) + parsedTmpl, err := template.New("slurmTemplate").Funcs(createFuncMap()).Parse(templateContent) if err != nil { - return "", fmt.Errorf("template parse failed: %w", err) + return "", err } templateCache.Store(templateContent, parsedTmpl) tmpl = parsedTmpl @@ -104,7 +159,8 @@ func (l *CommitHpcTaskLogic) RenderJobScript(templateContent string, req *JobReq var buf strings.Builder if err := tmpl.(*template.Template).Execute(&buf, params); err != nil { - return "", fmt.Errorf("template render failed: %w", err) + log.Error().Err(err).Msg("模板渲染失败") + return "", extractUserError(err) } return buf.String(), nil } @@ -235,36 +291,39 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t if err != nil { return nil, err } + scriptContent := req.ScriptContent + if scriptContent == "" { + // 获取模板 + var templateInfo types.HpcAppTemplateInfo + tx := l.svcCtx.DbEngin.Table("hpc_app_template"). + Where("cluster_id = ? and app = ? ", req.ClusterId, req.App) + if req.OperateType != "" { + tx.Where("app_type = ?", req.OperateType) + } + if err := tx.First(&templateInfo).Error; err != nil { + return nil, fmt.Errorf("failed to get template: %w", err) + } - // 获取模板 - var templateInfo types.HpcAppTemplateInfo - tx := l.svcCtx.DbEngin.Table("hpc_app_template"). - Where("cluster_id = ? and app = ? ", req.ClusterId, req.App) - if req.OperateType != "" { - tx.Where("app_type = ?", req.OperateType) - } - if err := tx.First(&templateInfo).Error; err != nil { - return nil, fmt.Errorf("failed to get template: %w", err) - } - - // 转换请求参数 - jobRequest, err := ConvertToJobRequest(req) - if err != nil { - return nil, fmt.Errorf("invalid job request: %w", err) - } + // 转换请求参数 + jobRequest, err := ConvertToJobRequest(req) + if err != nil { + return nil, err + } - // 渲染脚本 - script, err := l.RenderJobScript(templateInfo.Content, &jobRequest) - if err != nil { - return nil, fmt.Errorf("script rendering failed: %w", err) + // 渲染脚本 + script, err := l.RenderJobScript(templateInfo.Content, &jobRequest) + if err != nil { + return nil, err + } + scriptContent = script } - q, _ := jsoniter.MarshalToString(script) + q, _ := jsoniter.MarshalToString(scriptContent) submitQ := types.SubmitHpcTaskReq{ App: req.App, ClusterId: req.ClusterId, JobName: jobName, - ScriptContent: script, + ScriptContent: scriptContent, Parameters: req.Parameters, Backend: req.Backend, } @@ -276,7 +335,7 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t jobID := resp.Data.JobInfo["jobId"] workDir := resp.Data.JobInfo["jobDir"] - taskID, err := l.SaveHpcTaskToDB(req, script, jobID, workDir) + taskID, err := l.SaveHpcTaskToDB(req, scriptContent, jobID, workDir) if err != nil { log.Error().Msgf("Failed to save task to DB: %v", err) return nil, fmt.Errorf("db save failed: %w", err) diff --git a/internal/logic/hpc/gethpcappclusterlogic.go b/internal/logic/hpc/gethpcappclusterlogic.go new file mode 100644 index 00000000..39890217 --- /dev/null +++ b/internal/logic/hpc/gethpcappclusterlogic.go @@ -0,0 +1,39 @@ +package hpc + +import ( + "context" + "github.com/rs/zerolog/log" + + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type GetHpcAppClusterLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewGetHpcAppClusterLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetHpcAppClusterLogic { + return &GetHpcAppClusterLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *GetHpcAppClusterLogic) GetHpcAppCluster(req *types.HpcAppClusterReq) (resp *types.ListResult, err error) { + resp = &types.ListResult{} + var clusterIds []string + err = l.svcCtx.DbEngin.Table("hpc_app_template").Distinct("cluster_id"). + Where(" app = ? and status = 1 and deleted_at is null", req.App). + Find(&clusterIds).Error + if err != nil { + log.Error().Msgf("GetHpcAppCluster err:%v", err) + return nil, err + } + resp.List = clusterIds + return +} diff --git a/internal/types/types.go b/internal/types/types.go index 018ca64a..4fb79270 100644 --- a/internal/types/types.go +++ b/internal/types/types.go @@ -2786,6 +2786,10 @@ type Hooks struct { ContainerHooksResp ContainerHooksResp `json:"containerHooks,omitempty" copier:"ContainerHooksResp"` // * } +type HpcAppClusterReq struct { + App string `form:"app"` +} + type HpcAppTemplateInfo struct { Id int64 `json:"id"` Name string `json:"name"`