You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they may include dashes ('-') and can be up to 35 characters long.

commithpctasklogic.go 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. package hpc
  2. import (
  3. "context"
  4. "fmt"
  5. jsoniter "github.com/json-iterator/go"
  6. "github.com/pkg/errors"
  7. "github.com/rs/zerolog/log"
  8. "github.com/zeromicro/go-zero/core/logx"
  9. clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  15. "regexp"
  16. "strconv"
  17. "strings"
  18. "sync"
  19. "text/template"
  20. "time"
  21. )
// CommitHpcTaskLogic handles HPC job submission: it renders a job script
// from a stored template (or accepts a caller-supplied script), submits it
// through the adapter's executor, and persists the resulting task records.
type CommitHpcTaskLogic struct {
	logx.Logger
	ctx        context.Context
	svcCtx     *svc.ServiceContext
	hpcService *service.HpcService // provides the per-adapter executor map used at submit time
}
const (
	statusSaved     = "Saved"     // initial status written to the task row
	statusDeploying = "Deploying" // status of the task_hpc row right after submission
	adapterTypeHPC  = "2"         // dict value stored in Task.AdapterTypeDict marking an HPC adapter
)
// JobRequest is the payload handed to the job-script template renderer.
type JobRequest struct {
	App         string                 `json:"app"`         // application name
	Common      CommonParams           `json:"common"`      // scheduler-level options shared by all apps
	AppSpecific map[string]interface{} `json:"appSpecific"` // free-form, app-dependent parameters
}

// CommonParams carries the scheduler options rendered into the job script
// header (the template is named "slurmTemplate"; fields mirror Slurm-style
// options — confirm against the stored templates).
type CommonParams struct {
	JobName   string `json:"jobName"`
	Partition string `json:"partition"`
	Nodes     string `json:"nodes"`
	NTasks    string `json:"ntasks"`
	Time      string `json:"time,omitempty"` // wall-time limit; optional
	App       string `json:"app"`
}
  46. func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CommitHpcTaskLogic {
  47. cache := make(map[string]interface{}, 10)
  48. hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
  49. if err != nil {
  50. return nil
  51. }
  52. return &CommitHpcTaskLogic{
  53. Logger: logx.WithContext(ctx),
  54. ctx: ctx,
  55. svcCtx: svcCtx,
  56. hpcService: hpcService,
  57. }
  58. }
// templateCache caches parsed job-script templates, keyed by the full
// template source text, so each distinct template is parsed only once.
var templateCache = sync.Map{}
// getClusterInfo loads the cluster row identified by clusterID and the
// adapter row it references.
//
// Returns (nil, nil, err) when either query fails or when the fetched row
// carries an empty id. (NOTE(review): with gorm, First already returns
// ErrRecordNotFound on a missing row, so the empty-id checks look like a
// defensive belt-and-braces guard — confirm against the DbEngin version.)
func (l *CommitHpcTaskLogic) getClusterInfo(clusterID string) (*types.ClusterInfo, *types.AdapterInfo, error) {
	var clusterInfo types.ClusterInfo
	if err := l.svcCtx.DbEngin.Table("t_cluster").Where("id = ?", clusterID).First(&clusterInfo).Error; err != nil {
		return nil, nil, fmt.Errorf("cluster query failed: %w", err)
	}
	if clusterInfo.Id == "" {
		return nil, nil, errors.New("cluster not found")
	}
	var adapterInfo types.AdapterInfo
	if err := l.svcCtx.DbEngin.Table("t_adapter").Where("id = ?", clusterInfo.AdapterId).First(&adapterInfo).Error; err != nil {
		return nil, nil, fmt.Errorf("adapter query failed: %w", err)
	}
	if adapterInfo.Id == "" {
		return nil, nil, errors.New("adapter not found")
	}
	return &clusterInfo, &adapterInfo, nil
}
  78. // 自定义函数映射
  79. func createFuncMap() template.FuncMap {
  80. return template.FuncMap{
  81. "regexMatch": regexMatch,
  82. "required": required,
  83. "error": errorHandler,
  84. "default": defaultHandler,
  85. }
  86. }
  87. func extractUserError(originalErr error) error {
  88. // 尝试匹配模板引擎返回的错误格式
  89. re := regexp.MustCompile(`error calling \w+: (.*)$`)
  90. matches := re.FindStringSubmatch(originalErr.Error())
  91. if len(matches) > 1 {
  92. return errors.New(matches[1])
  93. }
  94. return originalErr
  95. }
  96. // 正则匹配函数
  97. func regexMatch(pattern string) *regexp.Regexp {
  98. return regexp.MustCompile(pattern)
  99. }
  100. // 必填字段检查
  101. func required(msg string, val interface{}) (interface{}, error) {
  102. if val == nil || val == "" {
  103. return nil, errors.New(msg)
  104. }
  105. return val, nil
  106. }
  107. // 错误处理函数
  108. func errorHandler(msg string) (string, error) {
  109. return "", errors.New(msg)
  110. }
  111. // 默认值处理函数
  112. func defaultHandler(defaultVal interface{}, val interface{}) interface{} {
  113. switch v := val.(type) {
  114. case nil:
  115. return defaultVal
  116. case string:
  117. if v == "" {
  118. return defaultVal
  119. }
  120. case int:
  121. if v == 0 {
  122. return defaultVal
  123. }
  124. // 可根据需要添加其他类型判断
  125. }
  126. return val
  127. }
  128. func (l *CommitHpcTaskLogic) RenderJobScript(templateContent string, req *JobRequest) (string, error) {
  129. // 使用缓存模板
  130. tmpl, ok := templateCache.Load(templateContent)
  131. if !ok {
  132. parsedTmpl, err := template.New("slurmTemplate").Funcs(createFuncMap()).Parse(templateContent)
  133. if err != nil {
  134. return "", err
  135. }
  136. templateCache.Store(templateContent, parsedTmpl)
  137. tmpl = parsedTmpl
  138. }
  139. params := map[string]interface{}{
  140. "Common": req.Common,
  141. "App": req.AppSpecific,
  142. }
  143. var buf strings.Builder
  144. if err := tmpl.(*template.Template).Execute(&buf, params); err != nil {
  145. log.Error().Err(err).Msg("模板渲染失败")
  146. return "", extractUserError(err)
  147. }
  148. return buf.String(), nil
  149. }
  150. func ConvertToJobRequest(job *types.CommitHpcTaskReq) (JobRequest, error) {
  151. required := []string{"jobName", "nodes", "ntasks"}
  152. for _, field := range required {
  153. if job.Parameters[field] == "" {
  154. return JobRequest{}, fmt.Errorf("%s is empty", field)
  155. }
  156. }
  157. return JobRequest{
  158. App: job.App,
  159. Common: CommonParams{
  160. JobName: job.Parameters["jobName"],
  161. Partition: job.Parameters["partition"],
  162. Nodes: job.Parameters["nodes"],
  163. NTasks: job.Parameters["ntasks"],
  164. Time: job.Parameters["time"],
  165. App: job.App,
  166. },
  167. AppSpecific: utils.MpaStringToInterface(job.Parameters),
  168. }, nil
  169. }
  170. func (l *CommitHpcTaskLogic) SaveHpcTaskToDB(req *types.CommitHpcTaskReq, jobScript, jobId, workDir string) (taskId string, err error) {
  171. // 使用事务确保数据一致性
  172. tx := l.svcCtx.DbEngin.Begin()
  173. defer func() {
  174. if r := recover(); r != nil {
  175. tx.Rollback()
  176. err = fmt.Errorf("transaction panic: %v", r)
  177. } else if err != nil {
  178. tx.Rollback()
  179. }
  180. }()
  181. userID, _ := strconv.ParseInt(req.Parameters["UserId"], 10, 64)
  182. taskID := utils.GenSnowflakeID()
  183. taskModel := models.Task{
  184. Id: taskID,
  185. Name: req.Name,
  186. Description: req.Description,
  187. CommitTime: time.Now(),
  188. Status: statusSaved,
  189. AdapterTypeDict: adapterTypeHPC,
  190. UserId: userID,
  191. }
  192. if err = tx.Table("task").Create(&taskModel).Error; err != nil {
  193. return "", fmt.Errorf("failed to create task: %w", err)
  194. }
  195. clusterInfo, adapterInfo, err := l.getClusterInfo(req.ClusterId)
  196. if err != nil {
  197. return "", err
  198. }
  199. paramsJSON, err := jsoniter.MarshalToString(req)
  200. if err != nil {
  201. return "", fmt.Errorf("failed to marshal parameters: %w", err)
  202. }
  203. clusterID := utils.StringToInt64(clusterInfo.Id)
  204. hpcTask := models.TaskHpc{
  205. Id: utils.GenSnowflakeID(),
  206. TaskId: taskID,
  207. AdapterId: clusterInfo.AdapterId,
  208. AdapterName: adapterInfo.Name,
  209. ClusterId: clusterID,
  210. ClusterName: clusterInfo.Name,
  211. Name: taskModel.Name,
  212. Backend: req.Backend,
  213. OperateType: req.OperateType,
  214. CmdScript: req.Parameters["cmdScript"],
  215. WallTime: req.Parameters["wallTime"],
  216. AppType: req.Parameters["appType"],
  217. AppName: req.App,
  218. Queue: req.Parameters["queue"],
  219. SubmitType: req.Parameters["submitType"],
  220. NNode: req.Parameters["nNode"],
  221. Account: clusterInfo.Username,
  222. StdInput: req.Parameters["stdInput"],
  223. Partition: req.Parameters["partition"],
  224. CreatedTime: time.Now(),
  225. UpdatedTime: time.Now(),
  226. Status: statusDeploying,
  227. UserId: userID,
  228. Params: paramsJSON,
  229. Script: jobScript,
  230. JobId: jobId,
  231. WorkDir: workDir,
  232. }
  233. if err = tx.Table("task_hpc").Create(&hpcTask).Error; err != nil {
  234. return "", fmt.Errorf("failed to create HPC task: %w", err)
  235. }
  236. noticeInfo := clientCore.NoticeInfo{
  237. AdapterId: clusterInfo.AdapterId,
  238. AdapterName: adapterInfo.Name,
  239. ClusterId: clusterID,
  240. ClusterName: clusterInfo.Name,
  241. NoticeType: "create",
  242. TaskName: req.Name,
  243. TaskId: taskID,
  244. Incident: "任务创建中",
  245. CreatedTime: time.Now(),
  246. }
  247. if err = tx.Table("t_notice").Create(&noticeInfo).Error; err != nil {
  248. return "", fmt.Errorf("failed to create notice: %w", err)
  249. }
  250. if err = tx.Commit().Error; err != nil {
  251. return "", fmt.Errorf("transaction commit failed: %w", err)
  252. }
  253. return utils.Int64ToString(taskID), nil
  254. }
  255. func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) {
  256. jobName := generateJobName(req)
  257. req.Parameters["jobName"] = jobName
  258. // 获取集群和适配器信息
  259. clusterInfo, adapterInfo, err := l.getClusterInfo(req.ClusterId)
  260. if err != nil {
  261. return nil, err
  262. }
  263. scriptContent := req.ScriptContent
  264. if scriptContent == "" {
  265. // 获取模板
  266. var templateInfo types.HpcAppTemplateInfo
  267. tx := l.svcCtx.DbEngin.Table("hpc_app_template").
  268. Where("cluster_id = ? and app = ? ", req.ClusterId, req.App)
  269. if req.OperateType != "" {
  270. tx.Where("app_type = ?", req.OperateType)
  271. }
  272. if err := tx.First(&templateInfo).Error; err != nil {
  273. return nil, fmt.Errorf("failed to get template: %w", err)
  274. }
  275. // 转换请求参数
  276. jobRequest, err := ConvertToJobRequest(req)
  277. if err != nil {
  278. return nil, err
  279. }
  280. // 渲染脚本
  281. script, err := l.RenderJobScript(templateInfo.Content, &jobRequest)
  282. if err != nil {
  283. return nil, err
  284. }
  285. scriptContent = script
  286. }
  287. q, _ := jsoniter.MarshalToString(scriptContent)
  288. submitQ := types.SubmitHpcTaskReq{
  289. App: req.App,
  290. ClusterId: req.ClusterId,
  291. JobName: jobName,
  292. ScriptContent: scriptContent,
  293. Parameters: req.Parameters,
  294. Backend: req.Backend,
  295. }
  296. log.Info().Msgf("Submitting HPC task to cluster %s with params: %s", clusterInfo.Name, q)
  297. resp, err = l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].SubmitTask(l.ctx, submitQ)
  298. if err != nil {
  299. log.Error().Err(err).Msgf("提交HPC任务失败, cluster: %s, jobName: %s, scriptContent: %s", clusterInfo.Name, jobName, scriptContent)
  300. return nil, fmt.Errorf("网络请求失败,请稍后重试")
  301. }
  302. jobID := resp.Data.JobInfo["jobId"]
  303. workDir := resp.Data.JobInfo["jobDir"]
  304. taskID, err := l.SaveHpcTaskToDB(req, scriptContent, jobID, workDir)
  305. if err != nil {
  306. log.Error().Msgf("Failed to save task to DB: %v", err)
  307. return nil, fmt.Errorf("db save failed: %w", err)
  308. }
  309. resp.Data.JobInfo["taskId"] = taskID
  310. return resp, nil
  311. }
  312. func generateJobName(req *types.CommitHpcTaskReq) string {
  313. if req.OperateType == "" {
  314. return req.Name
  315. }
  316. return req.Name + "_" + req.OperateType
  317. }

PCM is positioned as a software stack over cloud infrastructure, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.