- package hpc
-
- import (
- "context"
- "errors"
- clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
- "gitlink.org.cn/JointCloud/pcm-hpc/slurm"
- "strconv"
- "time"
-
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
-
- "github.com/zeromicro/go-zero/core/logx"
- )
-
- type CommitHpcTaskLogic struct {
- logx.Logger
- ctx context.Context
- svcCtx *svc.ServiceContext
- }
-
- func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CommitHpcTaskLogic {
- return &CommitHpcTaskLogic{
- Logger: logx.WithContext(ctx),
- ctx: ctx,
- svcCtx: svcCtx,
- }
- }
-
- func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) {
-
- var clusterInfo types.ClusterInfo
- l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where id = ?", req.ClusterId).First(&clusterInfo)
-
- if len(clusterInfo.Id) == 0 {
- return resp, errors.New("cluster not found")
- }
-
- // 构建主任务结构体
- taskModel := models.Task{
- Name: req.Name,
- Description: req.Description,
- CommitTime: time.Now(),
- Status: "Running",
- AdapterTypeDict: "2",
- }
-
- // 保存任务数据到数据库
- tx := l.svcCtx.DbEngin.Create(&taskModel)
- if tx.Error != nil {
- return nil, tx.Error
- }
-
- var adapterName string
- l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterName)
- if len(adapterName) == 0 || adapterName == "" {
- return nil, errors.New("no corresponding adapter found")
- }
- clusterId, err := strconv.ParseInt(req.ClusterId, 10, 64)
- hpcInfo := models.TaskHpc{
- TaskId: taskModel.Id,
- AdapterId: clusterInfo.AdapterId,
- AdapterName: adapterName,
- ClusterId: clusterId,
- ClusterName: clusterInfo.Name,
- Name: taskModel.Name,
- CmdScript: req.CmdScript,
- StartTime: time.Now().String(),
- CardCount: req.CardCount,
- WorkDir: req.WorkDir,
- WallTime: req.WallTime,
- AppType: req.AppType,
- AppName: req.AppName,
- Queue: req.Queue,
- SubmitType: req.SubmitType,
- NNode: req.NNode,
- Account: clusterInfo.Username,
- StdInput: req.StdInput,
- Partition: req.Partition,
- CreatedTime: time.Now(),
- UpdatedTime: time.Now(),
- Status: "Running",
- }
- hpcInfo.WorkDir = clusterInfo.WorkDir + req.WorkDir
- tx = l.svcCtx.DbEngin.Create(&hpcInfo)
- if tx.Error != nil {
- return nil, tx.Error
- }
- // 提交job到指定集群
- jobId, err := submitJob(&hpcInfo, &clusterInfo)
- if err != nil {
- return nil, err
- }
- // 保存操作记录
- noticeInfo := clientCore.NoticeInfo{
- AdapterId: clusterInfo.AdapterId,
- AdapterName: adapterName,
- ClusterId: clusterId,
- ClusterName: clusterInfo.Name,
- NoticeType: "create",
- TaskName: req.Name,
- Incident: "任务创建中",
- CreatedTime: time.Now(),
- }
- result := l.svcCtx.DbEngin.Table("t_notice").Create(¬iceInfo)
- if result.Error != nil {
- logx.Errorf("Task creation failure, err: %v", result.Error)
- }
- resp = &types.CommitHpcTaskResp{
- JobId: string(jobId),
- }
-
- return resp, nil
- }
-
- func submitJob(hpcInfo *models.TaskHpc, clusterInfo *types.ClusterInfo) (string, error) {
-
- client, err := slurm.NewClient(slurm.ClientOptions{
- URL: clusterInfo.Server,
- ClientVersion: clusterInfo.Version,
- RestUserName: clusterInfo.Username,
- Token: clusterInfo.Token})
- if err != nil {
- return "", err
- }
- job, err := client.Job(slurm.JobOptions{})
- if err != nil {
- return "", err
- }
- // 封装请求参数
- submitReq := slurm.JobOptions{
- Script: hpcInfo.CmdScript,
- Job: &slurm.JobProperties{
- Account: hpcInfo.Account,
- Name: hpcInfo.Name,
- NTasks: 1,
- CurrentWorkingDirectory: hpcInfo.WorkDir,
- Partition: hpcInfo.Partition,
- Environment: map[string]string{"PATH": clusterInfo.EnvPath,
- "LD_LIBRARY_PATH": clusterInfo.EnvLdPath},
- },
- }
- submitReq.Job.StandardOutput = submitReq.Job.CurrentWorkingDirectory + "/%j.out"
- submitReq.Job.StandardError = submitReq.Job.CurrentWorkingDirectory + "/%j.err"
-
- jobResp, err := job.SubmitJob(clusterInfo.ProxyAddress, submitReq)
- if err != nil {
- return "", err
- }
- jobId := strconv.Itoa(jobResp.JobId)
- return jobId, nil
- }
|