You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

commithpctasklogic.go 5.6 kB

11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. package hpc
  2. import (
  3. "context"
  4. "errors"
  5. jsoniter "github.com/json-iterator/go"
  6. clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  10. "strconv"
  11. "time"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  14. "github.com/zeromicro/go-zero/core/logx"
  15. )
  16. type CommitHpcTaskLogic struct {
  17. logx.Logger
  18. ctx context.Context
  19. svcCtx *svc.ServiceContext
  20. hpcService *service.HpcService
  21. }
  22. func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CommitHpcTaskLogic {
  23. cache := make(map[string]interface{}, 10)
  24. hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
  25. if err != nil {
  26. return nil
  27. }
  28. return &CommitHpcTaskLogic{
  29. Logger: logx.WithContext(ctx),
  30. ctx: ctx,
  31. svcCtx: svcCtx,
  32. hpcService: hpcService,
  33. }
  34. }
  35. func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) {
  36. req.Parameters["jobName"] = generateJobName(req)
  37. reqStr, _ := jsoniter.MarshalToString(req)
  38. yaml := utils.StringToYaml(reqStr)
  39. var clusterInfo types.ClusterInfo
  40. l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where id = ?", req.ClusterId).First(&clusterInfo)
  41. if len(clusterInfo.Id) == 0 {
  42. return resp, errors.New("cluster not found")
  43. }
  44. // 构建主任务结构体
  45. userId, _ := strconv.ParseInt(req.Parameters["UserId"], 10, 64)
  46. taskModel := models.Task{
  47. Id: utils.GenSnowflakeID(),
  48. Name: req.Name,
  49. Description: req.Description,
  50. CommitTime: time.Now(),
  51. Status: "Saved",
  52. AdapterTypeDict: "2",
  53. UserId: userId,
  54. YamlString: *yaml,
  55. }
  56. // 保存任务数据到数据库
  57. tx := l.svcCtx.DbEngin.Create(&taskModel)
  58. if tx.Error != nil {
  59. return nil, tx.Error
  60. }
  61. var adapterInfo types.AdapterInfo
  62. l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterInfo)
  63. if adapterInfo.Id == "" {
  64. return resp, errors.New("adapter not found")
  65. }
  66. clusterId, err := strconv.ParseInt(req.ClusterId, 10, 64)
  67. cardCount, _ := strconv.ParseInt(req.Parameters["cardCount"], 10, 64)
  68. timelimit, _ := strconv.ParseInt(req.Parameters["timeLimit"], 10, 64)
  69. hpcInfo := models.TaskHpc{
  70. Id: utils.GenSnowflakeID(),
  71. TaskId: taskModel.Id,
  72. AdapterId: clusterInfo.AdapterId,
  73. AdapterName: adapterInfo.Name,
  74. ClusterId: clusterId,
  75. ClusterName: clusterInfo.Name,
  76. Name: taskModel.Name,
  77. Backend: req.Backend,
  78. OperateType: req.OperateType,
  79. CmdScript: req.Parameters["cmdScript"],
  80. CardCount: cardCount,
  81. WorkDir: req.Parameters["workDir"],
  82. WallTime: req.Parameters["wallTime"],
  83. AppType: req.Parameters["appType"],
  84. AppName: req.App,
  85. Queue: req.Parameters["queue"],
  86. SubmitType: req.Parameters["submitType"],
  87. NNode: req.Parameters["nNode"],
  88. Account: clusterInfo.Username,
  89. StdInput: req.Parameters["stdInput"],
  90. Partition: req.Parameters["partition"],
  91. CreatedTime: time.Now(),
  92. UpdatedTime: time.Now(),
  93. Status: "Deploying",
  94. TimeLimit: timelimit,
  95. UserId: userId,
  96. YamlString: *yaml,
  97. }
  98. hpcInfo.WorkDir = clusterInfo.WorkDir + req.Parameters["WorkDir"]
  99. tx = l.svcCtx.DbEngin.Create(&hpcInfo)
  100. if tx.Error != nil {
  101. return nil, tx.Error
  102. }
  103. // 保存操作记录
  104. noticeInfo := clientCore.NoticeInfo{
  105. AdapterId: clusterInfo.AdapterId,
  106. AdapterName: adapterInfo.Name,
  107. ClusterId: clusterId,
  108. ClusterName: clusterInfo.Name,
  109. NoticeType: "create",
  110. TaskName: req.Name,
  111. TaskId: taskModel.Id,
  112. Incident: "任务创建中",
  113. CreatedTime: time.Now(),
  114. }
  115. result := l.svcCtx.DbEngin.Table("t_notice").Create(&noticeInfo)
  116. if result.Error != nil {
  117. logx.Errorf("Task creation failure, err: %v", result.Error)
  118. }
  119. // 数据上链
  120. // 查询资源价格
  121. //var price int64
  122. //l.svcCtx.DbEngin.Raw("select price from `resource_cost` where resource_id = ?", clusterId).Scan(&price)
  123. //bytes, _ := json.Marshal(taskModel)
  124. //remoteUtil.Evidence(remoteUtil.EvidenceParam{
  125. // UserIp: req.Parameters["UserIp"],
  126. // Url: l.svcCtx.Config.BlockChain.Url,
  127. // ContractAddress: l.svcCtx.Config.BlockChain.ContractAddress,
  128. // FunctionName: l.svcCtx.Config.BlockChain.FunctionName,
  129. // Type: l.svcCtx.Config.BlockChain.Type,
  130. // Token: req.Parameters["Token"],
  131. // Amount: price,
  132. // Args: []string{strconv.FormatInt(taskModel.Id, 10), string(bytes)},
  133. //})
  134. // 提交job到指定集群
  135. logx.Info("提交job到指定集群")
  136. resp, err = l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].SubmitTask(l.ctx, *req)
  137. if err != nil {
  138. logx.Errorf("提交Hpc到指定集群失败, err: %v", err)
  139. return nil, err
  140. }
  141. // 更新任务状态
  142. updates := l.svcCtx.DbEngin.Model(&hpcInfo).Updates(models.TaskHpc{
  143. Id: hpcInfo.Id,
  144. JobId: resp.Data.JobInfo["jobId"],
  145. WorkDir: resp.Data.JobInfo["jobDir"],
  146. })
  147. if updates.Error != nil {
  148. return nil, updates.Error
  149. }
  150. resp.Data.JobInfo["taskId"] = strconv.FormatInt(taskModel.Id, 10)
  151. logx.Infof("提交job到指定集群成功, resp: %v", resp)
  152. return resp, nil
  153. }
  154. // generateJobName 根据条件生成 jobName
  155. func generateJobName(req *types.CommitHpcTaskReq) string {
  156. if req.OperateType == "" {
  157. return req.Name
  158. }
  159. return req.Name + "_" + req.OperateType
  160. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.