Browse Source

xjlab 接口

zj_dev
jagger devad 3 months ago
parent
commit
4f7e47485b
27 changed files with 1239 additions and 176 deletions
  1. +4
    -1
      etc/pcm.yaml
  2. +3
    -1
      internal/handler/cloud/commitgeneraltaskhandler.go
  3. +4
    -2
      internal/handler/cloud/containercreatehandler.go
  4. +6
    -3
      internal/handler/hpc/commithpctaskhandler.go
  5. +25
    -0
      internal/handler/routes.go
  6. +3
    -1
      internal/handler/schedule/schedulecreatetaskhandler.go
  7. +64
    -0
      internal/handler/xjlab/task.go
  8. +6
    -4
      internal/logic/cloud/commitgeneraltasklogic.go
  9. +5
    -3
      internal/logic/cloud/containercreatelogic.go
  10. +53
    -43
      internal/logic/hpc/commithpctasklogic.go
  11. +8
    -7
      internal/logic/schedule/schedulecreatetasklogic.go
  12. +4
    -3
      internal/logic/schedule/schedulesubmitlogic.go
  13. +275
    -0
      internal/logic/xjlab/pagelisttasklogic.go
  14. +27
    -0
      internal/logic/xjlab/task_resource_usage.go
  15. +155
    -0
      internal/logic/xjlab/task_status_statistics.go
  16. +25
    -21
      internal/scheduler/database/aiStorage.go
  17. +4
    -3
      internal/scheduler/scheduler.go
  18. +5
    -4
      internal/scheduler/schedulers/aiScheduler.go
  19. +10
    -9
      internal/scheduler/service/inference/imageInference/imageInference.go
  20. +1
    -1
      internal/scheduler/service/inference/textInference/textInference.go
  21. +1
    -0
      internal/types/cloud/container.go
  22. +52
    -0
      internal/types/types.go
  23. +24
    -0
      pkg/constants/const.go
  24. +22
    -21
      pkg/models/taskaimodel_gen.go
  25. +58
    -49
      pkg/models/taskhpcmodel_gen.go
  26. +1
    -0
      pkg/models/taskmodel_gen.go
  27. +394
    -0
      pkg/utils/slurm_parser.go

+ 4
- 1
etc/pcm.yaml View File

@@ -94,4 +94,7 @@ JcsMiddleware:


Participant: Participant:
AdapterId: "1777144940456666666" AdapterId: "1777144940456666666"
CloudAdapterId: "1770658294298316800"
CloudAdapterId: "1770658294298316800"

JccUserService:
Url: http://jcce-admin:8082/jcc-admin/admin/user/{id}

+ 3
- 1
internal/handler/cloud/commitgeneraltaskhandler.go View File

@@ -1,11 +1,12 @@
package cloud package cloud


import ( import (
"net/http"

"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/json"
"net/http"


"github.com/zeromicro/go-zero/rest/httpx" "github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/cloud" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/cloud"
@@ -32,6 +33,7 @@ func CommitGeneralTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
user := &models.JccUserInfo{} user := &models.JccUserInfo{}
json.Unmarshal([]byte(userStr), user) json.Unmarshal([]byte(userStr), user)
req.UserId = user.Id req.UserId = user.Id
req.UserName = user.UserName
l := cloud.NewCommitGeneralTaskLogic(r.Context(), svcCtx) l := cloud.NewCommitGeneralTaskLogic(r.Context(), svcCtx)
resp, err := l.CommitGeneralTask(&req) resp, err := l.CommitGeneralTask(&req)
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)


+ 4
- 2
internal/handler/cloud/containercreatehandler.go View File

@@ -1,12 +1,13 @@
package cloud package cloud


import ( import (
"io"
"net/http"

"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/cloud" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"io"
"k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/json"
"net/http"


"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
container "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types/cloud" container "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types/cloud"
@@ -30,6 +31,7 @@ func ContainerCreateHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
user := &models.JccUserInfo{} user := &models.JccUserInfo{}
json.Unmarshal([]byte(userStr), user) json.Unmarshal([]byte(userStr), user)
req.UserId = user.Id req.UserId = user.Id
req.UserName = user.UserName
l := cloud.NewContainerCreateLogic(r.Context(), svcCtx) l := cloud.NewContainerCreateLogic(r.Context(), svcCtx)
resp, err := l.ContainerCreate(&req) resp, err := l.ContainerCreate(&req)
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)


+ 6
- 3
internal/handler/hpc/commithpctaskhandler.go View File

@@ -1,14 +1,16 @@
package hpc package hpc


import ( import (
"net/http"
"strconv"

"github.com/zeromicro/go-zero/rest/httpx" "github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/hpc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/hpc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"net/http"
"strconv"
) )


func CommitHpcTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { func CommitHpcTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
@@ -30,7 +32,8 @@ func CommitHpcTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
result.ParamErrorResult(r, w, err) result.ParamErrorResult(r, w, err)
return return
} }
req.Parameters["UserId"] = strconv.FormatInt(jccUserInfo.Id, 10)
req.Parameters[constants.UserId] = strconv.FormatInt(jccUserInfo.Id, 10)
req.Parameters[constants.UserName] = jccUserInfo.UserName


l := hpc.NewCommitHpcTaskLogic(r.Context(), svcCtx) l := hpc.NewCommitHpcTaskLogic(r.Context(), svcCtx)
resp, err := l.CommitHpcTask(&req) resp, err := l.CommitHpcTask(&req)


+ 25
- 0
internal/handler/routes.go View File

@@ -16,6 +16,7 @@ import (
storage "gitlink.org.cn/JointCloud/pcm-coordinator/internal/handler/storage" storage "gitlink.org.cn/JointCloud/pcm-coordinator/internal/handler/storage"
storelink "gitlink.org.cn/JointCloud/pcm-coordinator/internal/handler/storelink" storelink "gitlink.org.cn/JointCloud/pcm-coordinator/internal/handler/storelink"
vm "gitlink.org.cn/JointCloud/pcm-coordinator/internal/handler/vm" vm "gitlink.org.cn/JointCloud/pcm-coordinator/internal/handler/vm"
xjlab "gitlink.org.cn/JointCloud/pcm-coordinator/internal/handler/xjlab"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"


"github.com/zeromicro/go-zero/rest" "github.com/zeromicro/go-zero/rest"
@@ -1735,4 +1736,28 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
}, },
rest.WithPrefix("/pcm/v1"), rest.WithPrefix("/pcm/v1"),
) )

// Register the xjlab task query routes under the /pcm/v1 prefix.
server.AddRoutes(
[]rest.Route{
{
// Query the task list.
Method: http.MethodGet,
Path: "/xjlab/taskList",
Handler: xjlab.TaskListHandler(serverCtx),
},
{
// Query the resource usage of a specific task.
Method: http.MethodGet,
Path: "/xjlab/taskResourceUsage",
Handler: xjlab.TaskResourceUsageHandler(serverCtx),
},
{
// Task status monitoring.
Method: http.MethodGet,
Path: "/xjlab/taskStatusStatistics",
Handler: xjlab.TaskStatusStatisticsHandler(serverCtx),
},
},
rest.WithPrefix("/pcm/v1"),
)
} }

+ 3
- 1
internal/handler/schedule/schedulecreatetaskhandler.go View File

@@ -1,13 +1,14 @@
package schedule package schedule


import ( import (
"net/http"

"github.com/zeromicro/go-zero/rest/httpx" "github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/schedule" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"net/http"
) )


func ScheduleCreateTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { func ScheduleCreateTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
@@ -29,6 +30,7 @@ func ScheduleCreateTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return return
} }
req.UserId = jccUserInfo.Id req.UserId = jccUserInfo.Id
req.UserName = jccUserInfo.UserName
l := schedule.NewScheduleCreateTaskLogic(r.Context(), svcCtx) l := schedule.NewScheduleCreateTaskLogic(r.Context(), svcCtx)
resp, err := l.ScheduleCreateTask(&req) resp, err := l.ScheduleCreateTask(&req)
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)


+ 64
- 0
internal/handler/xjlab/task.go View File

@@ -0,0 +1,64 @@
package xjlab

import (
"net/http"

"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/xjlab"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
)

// TaskListHandler returns the HTTP handler for the xjlab task list query.
//
// The handler parses the request into types.XJLABTaskReq, resolves the caller
// from the Authorization header, and delegates to PageListTaskLogic. Parse and
// token errors are reported through result.ParamErrorResult.
//
// The task query filters rows by req.UserName and skips that filter when the
// name is "admin". The name is therefore taken from the token for every caller
// except "admin"; only the admin caller may supply userName to look at another
// user's tasks. Previously any caller could pass an arbitrary userName
// (including "admin") and read tasks that were not theirs.
func TaskListHandler(ctx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		var req types.XJLABTaskReq
		if err := httpx.Parse(r, &req); err != nil {
			result.ParamErrorResult(r, w, err)
			return
		}

		// Resolve the caller from the token carried in the Authorization header.
		token := r.Header.Get("Authorization")
		jccUserInfo, err := utils.ParseTokenWithoutVerify(token)
		if err != nil {
			result.ParamErrorResult(r, w, err)
			return
		}
		req.UserId = jccUserInfo.Id

		// Never trust a client-supplied user name unless the caller is admin.
		if req.UserName == "" || jccUserInfo.UserName != "admin" {
			req.UserName = jccUserInfo.UserName
		}

		l := xjlab.NewPageListTaskLogic(r.Context(), ctx)
		resp, err := l.PageListTask(&req)
		result.HttpResult(r, w, resp, err)
	}
}

// TaskResourceUsageHandler returns the HTTP handler for the task resource
// usage query. It parses the request into types.FId, runs
// TaskResourceUsageLogic.TaskResourceUsage and writes the outcome through
// result.HttpResult; a parse failure is reported via result.ParamErrorResult.
func TaskResourceUsageHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		params := new(types.FId)
		parseErr := httpx.Parse(r, params)
		if parseErr != nil {
			result.ParamErrorResult(r, w, parseErr)
			return
		}

		usageLogic := xjlab.NewTaskResourceUsageLogic(r.Context(), svcCtx)
		usage, usageErr := usageLogic.TaskResourceUsage(params)
		result.HttpResult(r, w, usage, usageErr)
	}
}

// TaskStatusStatisticsHandler returns the HTTP handler for the task status
// statistics query. It parses the request into types.XJLABCommonReq, runs
// GetSimpleTaskStatistics and writes the outcome through result.HttpResult;
// a parse failure is reported via result.ParamErrorResult.
func TaskStatusStatisticsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		params := new(types.XJLABCommonReq)
		parseErr := httpx.Parse(r, params)
		if parseErr != nil {
			result.ParamErrorResult(r, w, parseErr)
			return
		}

		statsLogic := xjlab.NewTaskStatusStatisticsLogic(r.Context(), svcCtx)
		stats, statsErr := statsLogic.GetSimpleTaskStatistics(params)
		result.HttpResult(r, w, stats, statsErr)
	}
}

+ 6
- 4
internal/logic/cloud/commitgeneraltasklogic.go View File

@@ -3,6 +3,11 @@ package cloud
import ( import (
"bytes" "bytes"
"context" "context"
"io"
"strconv"
"strings"
"time"

"github.com/pkg/errors" "github.com/pkg/errors"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client" clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
@@ -13,15 +18,11 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/remoteUtil" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/remoteUtil"
"io"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
syaml "k8s.io/apimachinery/pkg/runtime/serializer/yaml" syaml "k8s.io/apimachinery/pkg/runtime/serializer/yaml"
"k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/json"
kyaml "k8s.io/apimachinery/pkg/util/yaml" kyaml "k8s.io/apimachinery/pkg/util/yaml"
"strconv"
"strings"
"time"


"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
@@ -102,6 +103,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) (r
SynergyStatus: synergyStatus, SynergyStatus: synergyStatus,
Strategy: strategy, Strategy: strategy,
UserId: req.UserId, UserId: req.UserId,
UserName: req.UserName,
} }
resp.TaskId = taskModel.Id resp.TaskId = taskModel.Id
var taskClouds []cloud.TaskCloudModel var taskClouds []cloud.TaskCloudModel


+ 5
- 3
internal/logic/cloud/containercreatelogic.go View File

@@ -18,6 +18,10 @@ import (
"context" "context"
"errors" "errors"
"fmt" "fmt"
"net/http"
"strconv"
"time"

"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/participant/cloud" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/participant/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
@@ -26,9 +30,6 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
cloud2 "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud" cloud2 "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"net/http"
"strconv"
"time"
) )


type ContainerCreateLogic struct { type ContainerCreateLogic struct {
@@ -79,6 +80,7 @@ func (l *ContainerCreateLogic) ContainerCreate(req *container.CreateParam) (resp
Description: req.Description, Description: req.Description,
Name: req.Name, Name: req.Name,
UserId: req.UserId, UserId: req.UserId,
UserName: req.UserName,
AdapterTypeDict: "0", AdapterTypeDict: "0",
CommitTime: time.Now(), CommitTime: time.Now(),
} }


+ 53
- 43
internal/logic/hpc/commithpctasklogic.go View File

@@ -3,6 +3,13 @@ package hpc
import ( import (
"context" "context"
"fmt" "fmt"
"regexp"
"strconv"
"strings"
"sync"
"text/template"
"time"

jsoniter "github.com/json-iterator/go" jsoniter "github.com/json-iterator/go"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
@@ -12,14 +19,9 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"regexp"
"strconv"
"strings"
"sync"
"text/template"
"time"
) )


type CommitHpcTaskLogic struct { type CommitHpcTaskLogic struct {
@@ -29,12 +31,6 @@ type CommitHpcTaskLogic struct {
hpcService *service.HpcService hpcService *service.HpcService
} }


const (
statusSaved = "Saved"
statusDeploying = "Deploying"
adapterTypeHPC = "2"
)

type JobRequest struct { type JobRequest struct {
App string `json:"app"` App string `json:"app"`
Common CommonParams `json:"common"` Common CommonParams `json:"common"`
@@ -201,16 +197,18 @@ func (l *CommitHpcTaskLogic) SaveHpcTaskToDB(req *types.CommitHpcTaskReq, jobScr
} }
}() }()


userID, _ := strconv.ParseInt(req.Parameters["UserId"], 10, 64)
userID, _ := strconv.ParseInt(req.Parameters[constants.UserId], 10, 64)
taskID := utils.GenSnowflakeID() taskID := utils.GenSnowflakeID()

taskModel := models.Task{ taskModel := models.Task{
Id: taskID, Id: taskID,
Name: req.Name, Name: req.Name,
Description: req.Description, Description: req.Description,
CommitTime: time.Now(), CommitTime: time.Now(),
Status: statusSaved,
AdapterTypeDict: adapterTypeHPC,
Status: constants.StatusSaved,
AdapterTypeDict: constants.AdapterTypeHPC,
UserId: userID, UserId: userID,
UserName: req.Parameters[constants.UserName],
} }


if err = tx.Table("task").Create(&taskModel).Error; err != nil { if err = tx.Table("task").Create(&taskModel).Error; err != nil {
@@ -226,36 +224,48 @@ func (l *CommitHpcTaskLogic) SaveHpcTaskToDB(req *types.CommitHpcTaskReq, jobScr
if err != nil { if err != nil {
return "", fmt.Errorf("failed to marshal parameters: %w", err) return "", fmt.Errorf("failed to marshal parameters: %w", err)
} }

//解析slurm脚本内容
var resource models.ResourceSpec
if req.Backend == string(constants.HPC_SYSTEM_SLURM) {
parser := utils.NewSlurmParser()
slurmResource := parser.ParseScript(jobScript)
resource = models.ResourceSpec{
//资源规格名称,采用拼接的方式 集群名+队列名
ResourceName: fmt.Sprintf("%s_%s", clusterInfo.Name, slurmResource.Partition),
Partition: slurmResource.Partition,
Specifications: slurmResource,
}
}
clusterID := utils.StringToInt64(clusterInfo.Id) clusterID := utils.StringToInt64(clusterInfo.Id)
hpcTask := models.TaskHpc{ hpcTask := models.TaskHpc{
Id: utils.GenSnowflakeID(),
TaskId: taskID,
AdapterId: clusterInfo.AdapterId,
AdapterName: adapterInfo.Name,
ClusterId: clusterID,
ClusterName: clusterInfo.Name,
Name: taskModel.Name,
Backend: req.Backend,
OperateType: req.OperateType,
CmdScript: req.Parameters["cmdScript"],
WallTime: req.Parameters["wallTime"],
AppType: req.Parameters["appType"],
AppName: req.App,
Queue: req.Parameters["queue"],
SubmitType: req.Parameters["submitType"],
NNode: req.Parameters["nNode"],
Account: clusterInfo.Username,
StdInput: req.Parameters["stdInput"],
Partition: req.Parameters["partition"],
CreatedTime: time.Now(),
UpdatedTime: time.Now(),
Status: statusDeploying,
UserId: userID,
Params: paramsJSON,
Script: jobScript,
JobId: jobId,
WorkDir: workDir,
Id: utils.GenSnowflakeID(),
TaskId: taskID,
AdapterId: clusterInfo.AdapterId,
AdapterName: adapterInfo.Name,
ClusterId: clusterID,
ClusterName: clusterInfo.Name,
Name: taskModel.Name,
Backend: req.Backend,
OperateType: req.OperateType,
CmdScript: req.Parameters["cmdScript"],
WallTime: req.Parameters["wallTime"],
AppType: req.Parameters["appType"],
AppName: req.App,
Queue: req.Parameters["queue"],
SubmitType: req.Parameters["submitType"],
NNode: req.Parameters["nNode"],
Account: clusterInfo.Username,
StdInput: req.Parameters["stdInput"],
Partition: req.Parameters["partition"],
CreatedTime: time.Now(),
UpdatedTime: time.Now(),
Status: constants.StatusDeploying,
UserId: userID,
Params: paramsJSON,
Script: jobScript,
JobId: jobId,
WorkDir: workDir,
ResourceSpec: resource,
} }


if err = tx.Table("task_hpc").Create(&hpcTask).Error; err != nil { if err = tx.Table("task_hpc").Create(&hpcTask).Error; err != nil {


+ 8
- 7
internal/logic/schedule/schedulecreatetasklogic.go View File

@@ -3,6 +3,10 @@ package schedule
import ( import (
"context" "context"
"fmt" "fmt"
"slices"
"strings"
"time"

"github.com/pkg/errors" "github.com/pkg/errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
@@ -12,9 +16,6 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
"slices"
"strings"
"time"


"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
) )
@@ -167,7 +168,7 @@ func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) (
// filter data distribution // filter data distribution
clustersWithDataDistributes := generateFilteredDataDistributes(assignedClusters, req.DataDistributes) clustersWithDataDistributes := generateFilteredDataDistributes(assignedClusters, req.DataDistributes)


taskId, err := l.createTask(taskName, req.Description, req.UserId, req.JobResources.ScheduleStrategy, clustersWithDataDistributes, req.Token, req.UserIp)
taskId, err := l.createTask(taskName, req.Description, req.UserId, req.JobResources.ScheduleStrategy, clustersWithDataDistributes, req.Token, req.UserIp, req.UserName)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -198,7 +199,7 @@ func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) (
// filter data distribution // filter data distribution
clustersWithDataDistributes := generateFilteredDataDistributes(assignedClusters, req.DataDistributes) clustersWithDataDistributes := generateFilteredDataDistributes(assignedClusters, req.DataDistributes)


taskId, err := l.createTask(taskName, req.Description, req.UserId, req.JobResources.ScheduleStrategy, clustersWithDataDistributes, req.Token, req.UserIp)
taskId, err := l.createTask(taskName, req.Description, req.UserId, req.JobResources.ScheduleStrategy, clustersWithDataDistributes, req.Token, req.UserIp, req.UserName)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -262,7 +263,7 @@ func (l *ScheduleCreateTaskLogic) getAssignedClustersByStrategy(resources *types
return assignedClusters, nil return assignedClusters, nil
} }


func (l *ScheduleCreateTaskLogic) createTask(taskName string, desc string, userId int64, strategyName string, clustersWithDataDistributes *ClustersWithDataDistributes, token string, userIp string) (int64, error) {
func (l *ScheduleCreateTaskLogic) createTask(taskName string, desc string, userId int64, strategyName string, clustersWithDataDistributes *ClustersWithDataDistributes, token string, userIp string, userName string) (int64, error) {
var synergyStatus int64 var synergyStatus int64
if len(clustersWithDataDistributes.Clusters) > 1 { if len(clustersWithDataDistributes.Clusters) > 1 {
synergyStatus = 1 synergyStatus = 1
@@ -273,7 +274,7 @@ func (l *ScheduleCreateTaskLogic) createTask(taskName string, desc string, userI
fmt.Printf("Error while Marshaling. %v", err) fmt.Printf("Error while Marshaling. %v", err)
} }


taskId, err := l.svcCtx.Scheduler.CreateTask(taskName, desc, userId, synergyStatus, strategyName, string(y), token, userIp, &l.svcCtx.Config)
taskId, err := l.svcCtx.Scheduler.CreateTask(taskName, desc, userId, synergyStatus, strategyName, string(y), token, userIp, &l.svcCtx.Config, userName)
if err != nil { if err != nil {
return 0, err return 0, err
} }


+ 4
- 3
internal/logic/schedule/schedulesubmitlogic.go View File

@@ -2,14 +2,15 @@ package schedule


import ( import (
"context" "context"
"strconv"
"strings"

"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"strconv"
"strings"


"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
) )
@@ -65,7 +66,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
synergystatus = 1 synergystatus = 1
} }


taskId, err := l.svcCtx.Scheduler.CreateTask(req.AiOption.TaskName, "", 0, synergystatus, req.AiOption.Strategy, "", req.Token, "", &l.svcCtx.Config)
taskId, err := l.svcCtx.Scheduler.CreateTask(req.AiOption.TaskName, "", 0, synergystatus, req.AiOption.Strategy, "", req.Token, "", &l.svcCtx.Config, "")
if err != nil { if err != nil {
return nil, err return nil, err
} }


+ 275
- 0
internal/logic/xjlab/pagelisttasklogic.go View File

@@ -0,0 +1,275 @@
package xjlab

import (
"context"
"time"

jsoniter "github.com/json-iterator/go"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
"gorm.io/gorm"
)

// PageListTaskLogic implements the paged xjlab task list query.
// It embeds a logx.Logger and carries the request context together with the
// shared service context used for database and scheduler access.
type PageListTaskLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}

// NewPageListTaskLogic creates a PageListTaskLogic bound to ctx and svcCtx,
// with a logger derived from ctx.
func NewPageListTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *PageListTaskLogic {
	logic := new(PageListTaskLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}

// TaskResp is one entry of the task list returned by PageListTask: the task
// fields copied from types.TaskModel via a JSON round trip, plus the cluster
// details filled in by enrichTaskDetails.
type TaskResp struct {
Id int64 `json:"id,omitempty,string" db:"id"` // id
Name string `json:"name,omitempty" db:"name"` // job name
Description string `json:"description,omitempty" db:"description"` // job description
Status string `json:"status,omitempty" db:"status"` // job status
Strategy int64 `json:"strategy" db:"strategy"` // strategy
SynergyStatus int64 `json:"synergyStatus" db:"synergy_status"` // synergy status (0 = not coordinated, 1 = coordinated)
CommitTime string `json:"commitTime,omitempty" db:"commit_time"` // commit time
StartTime string `json:"startTime,omitempty" db:"start_time"` // start time
EndTime string `json:"endTime,omitempty" db:"end_time"` // end time of the run
RunningTime int64 `json:"runningTime" db:"running_time"` // elapsed running time, in seconds
YamlString string `json:"yamlString,omitempty" db:"yaml_string"`
Result string `json:"result,omitempty" db:"result"` // job result
DeletedAt string `json:"deletedAt,omitempty" gorm:"index" db:"deleted_at"`
NsID string `json:"nsId,omitempty" db:"ns_id"`
TenantId string `json:"tenantId,omitempty" db:"tenant_id"`
CreatedTime string `json:"createdTime,omitempty" db:"created_time" gorm:"autoCreateTime"`
UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time"`
AdapterTypeDict string `json:"adapterTypeDict" db:"adapter_type_dict" gorm:"adapter_type_dict"` // adapter type (value from the dictionary table)
TaskTypeDict string `json:"taskTypeDict" db:"task_type_dict" gorm:"task_type_dict"` // task type (value from the dictionary table)
UserId int64 `json:"userId,omitempty" db:"user_id"`
UserName string `json:"userName,omitempty" db:"user_name"`

// Cluster details, populated from the adapter-specific task table.
ClusterId string `json:"clusterId,omitempty" db:"cluster_id"`
ClusterName string `json:"clusterName,omitempty" db:"cluster_name"`
ResourceSpec string `json:"resourceSpec,omitempty" db:"resource_spec"`
Card string `json:"card,omitempty" db:"card"`
}

// clusterInfo holds the cluster columns read from the adapter-specific task
// tables (task_cloud, task_ai, task_hpc) by getClusterInfo.
type clusterInfo struct {
ClusterId string `json:"cluster_id"`
ClusterName string `json:"cluster_name"`
ResourceSpec string `json:"resource_spec"`
Card string `json:"card"`
}

// PageListTask returns one page of tasks matching req.
//
// It validates the paging parameters, counts the matching rows, loads the
// requested page, converts every row into a TaskResp enriched with cluster
// details, and finally triggers a background status refresh for the loaded
// tasks. Validation and database errors are returned to the caller.
func (l *PageListTaskLogic) PageListTask(req *types.XJLABTaskReq) (*types.PageResult, error) {
	// Validate the request parameters.
	if err := l.validateRequest(req); err != nil {
		return nil, err
	}

	// Count the matching tasks.
	total, err := l.getTaskCount(req)
	if err != nil {
		return nil, err
	}

	// Load the requested page.
	tasks, err := l.getTaskList(req)
	if err != nil {
		return nil, err
	}

	// Build the response before handing the models to the background
	// updaters: processTaskResponses writes RunningTime on every model and
	// serializes it, so running it while the updater goroutines work on the
	// same pointers would be an unsynchronized concurrent access.
	taskResps := l.processTaskResponses(tasks)

	// Refresh task status asynchronously; the result is not awaited.
	l.updateTaskStatusAsync(tasks)

	// Assemble the paged result.
	return &types.PageResult{
		List:     &taskResps,
		PageSize: req.PageSize,
		PageNum:  req.PageNum,
		Total:    total,
	}, nil
}

// validateRequest checks the paging parameters of req. Both PageSize and
// PageNum must be positive; otherwise a default error is returned.
func (l *PageListTaskLogic) validateRequest(req *types.XJLABTaskReq) error {
	if req.PageSize > 0 && req.PageNum > 0 {
		return nil
	}
	return result.NewDefaultError("Invalid page size or page number")
}

// buildBaseQuery builds the filtered query over the "task" table that is
// shared by the count and the list queries. Soft-deleted rows are always
// excluded; every other filter is applied only when the matching request
// field is non-empty.
//
// The time range is matched against created_time. When both bounds are given
// the range is inclusive (BETWEEN); a request carrying only StartTime or only
// EndTime is applied as a one-sided inclusive bound instead of being ignored.
func (l *PageListTaskLogic) buildBaseQuery(req *types.XJLABTaskReq) *gorm.DB {
	db := l.svcCtx.DbEngin.Model(&types.TaskModel{}).Table("task").Where("deleted_at is null")

	// Per-user filter; the "admin" user name sees every user's tasks.
	if req.UserName != "" && req.UserName != "admin" {
		db = db.Where("user_name = ?", req.UserName)
	}

	// Fuzzy match on the task name.
	if req.Name != "" {
		db = db.Where("name LIKE ?", "%"+req.Name+"%")
	}

	// Filter by computing (adapter) type.
	if req.AdapterTypeDict != "" {
		db = db.Where("adapter_type_dict = ?", req.AdapterTypeDict)
	}

	// Filter by task status.
	if req.Status != "" {
		db = db.Where("status = ?", req.Status)
	}

	// Filter by creation time range, accepting open-ended ranges.
	switch {
	case req.StartTime != "" && req.EndTime != "":
		db = db.Where("created_time BETWEEN ? AND ?", req.StartTime, req.EndTime)
	case req.StartTime != "":
		db = db.Where("created_time >= ?", req.StartTime)
	case req.EndTime != "":
		db = db.Where("created_time <= ?", req.EndTime)
	}

	return db
}

// getTaskCount returns the number of tasks matching the filters in req.
// A database failure is wrapped into a default error.
func (l *PageListTaskLogic) getTaskCount(req *types.XJLABTaskReq) (int64, error) {
	var count int64

	err := l.buildBaseQuery(req).Count(&count).Error
	if err != nil {
		return 0, result.NewDefaultError(err.Error())
	}
	return count, nil
}

// getTaskList loads the page of tasks selected by req.PageNum and
// req.PageSize, newest first by created_time. A database failure is wrapped
// into a default error.
func (l *PageListTaskLogic) getTaskList(req *types.XJLABTaskReq) ([]*types.TaskModel, error) {
	var tasks []*types.TaskModel

	// Rows to skip: all rows of the pages before the requested one.
	skip := (req.PageNum - 1) * req.PageSize

	query := l.buildBaseQuery(req).
		Limit(req.PageSize).
		Offset(skip).
		Order("created_time desc")

	if err := query.Find(&tasks).Error; err != nil {
		return nil, result.NewDefaultError(err.Error())
	}
	return tasks, nil
}

// updateTaskStatusAsync starts two goroutines that refresh the status of the
// given tasks through the scheduler's AI service storage. It returns
// immediately; the goroutines are neither awaited nor bound to l.ctx.
// NOTE(review): both goroutines receive the same slice of task pointers —
// confirm the two updaters do not write the same fields concurrently.
func (l *PageListTaskLogic) updateTaskStatusAsync(tasks []*types.TaskModel) {
go l.svcCtx.Scheduler.AiService.St.UpdateTaskStatus(tasks)
go l.svcCtx.Scheduler.AiService.St.UpdateAiTaskStatus(tasks)
}

// processTaskResponses turns the loaded task models into response entries.
// For every task it stores the computed running time on the model, converts
// the model into a TaskResp and fills in the cluster details. A task whose
// conversion fails is logged and left out of the result.
func (l *PageListTaskLogic) processTaskResponses(tasks []*types.TaskModel) []*TaskResp {
	out := make([]*TaskResp, 0, len(tasks))

	for i := range tasks {
		task := tasks[i]

		// Running time is derived from the start/end timestamps.
		task.RunningTime = l.calculateRunningTime(task.StartTime, task.EndTime)

		// Convert the model into the response shape.
		resp, convErr := l.convertToTaskResp(task)
		if convErr != nil {
			l.Errorf("Failed to convert task model: %v", convErr)
			continue
		}

		// Attach cluster details and keep a pointer to this entry's own copy.
		detailed := l.enrichTaskDetails(resp)
		out = append(out, &detailed)
	}

	return out
}

// convertToTaskResp copies a TaskModel into a TaskResp by marshalling the
// model to JSON and unmarshalling it into the response type, so fields are
// matched by their JSON names. Either step's error is returned unchanged.
func (l *PageListTaskLogic) convertToTaskResp(model *types.TaskModel) (*TaskResp, error) {
	raw, err := jsoniter.Marshal(model)
	if err != nil {
		return nil, err
	}

	resp := new(TaskResp)
	if err = jsoniter.Unmarshal(raw, resp); err != nil {
		return nil, err
	}
	return resp, nil
}

// calculateRunningTime returns the running time of a task in whole seconds.
// An empty start time yields 0. With an empty end time the duration is
// measured from the start time up to now; otherwise it is end minus start.
func (l *PageListTaskLogic) calculateRunningTime(startTimeStr, endTimeStr string) int64 {
	if startTimeStr == "" {
		return 0
	}
	begin := timeutils.TimeStringToGoTime(startTimeStr)

	var elapsed time.Duration
	if endTimeStr != "" {
		elapsed = timeutils.TimeStringToGoTime(endTimeStr).Sub(begin)
	} else {
		// No end time recorded: measure up to the current time.
		elapsed = time.Since(begin)
	}
	return int64(elapsed.Seconds())
}

// enrichTaskDetails looks up the cluster information for task, writes it
// onto task in place and returns a copy of the updated value.
func (l *PageListTaskLogic) enrichTaskDetails(task *TaskResp) TaskResp {
	info := l.getClusterInfo(task.Id, task.AdapterTypeDict)

	task.ClusterId, task.ClusterName = info.ClusterId, info.ClusterName
	task.ResourceSpec, task.Card = info.ResourceSpec, info.Card

	return *task
}

// getClusterInfo reads the cluster details of a task from the table that
// belongs to its adapter type: task_cloud, task_ai or task_hpc. An unknown
// adapter type yields a zero-valued clusterInfo without touching the database.
//
// The lookup is best-effort: a database error is logged and whatever was
// scanned so far is returned, so a failing lookup never fails the task list.
// For AI tasks an empty resource_spec falls back to the card column.
func (l *PageListTaskLogic) getClusterInfo(taskId int64, adapterType string) clusterInfo {
	var cluster clusterInfo

	// Pick the source table and the columns it provides.
	var table, columns string
	switch adapterType {
	case constants.AdapterTypeCloud:
		// Cloud computing task.
		table, columns = "task_cloud", "cluster_id,cluster_name,resource_spec"
	case constants.AdapterTypeAI:
		// AI computing task; this table additionally carries the card column.
		table, columns = "task_ai", "cluster_id,cluster_name,resource_spec,card"
	case constants.AdapterTypeHPC:
		// High-performance computing task.
		table, columns = "task_hpc", "cluster_id,cluster_name,resource_spec"
	default:
		return cluster
	}

	err := l.svcCtx.DbEngin.Table(table).
		Where("task_id = ?", taskId).
		Select(columns).
		Find(&cluster).Error
	if err != nil {
		// Surface the failure in the log instead of dropping it silently.
		l.Errorf("Failed to query %s for task %d: %v", table, taskId, err)
	}

	// AI tasks without a resource spec report their card instead.
	if adapterType == constants.AdapterTypeAI && cluster.ResourceSpec == "" {
		cluster.ResourceSpec = cluster.Card
	}

	return cluster
}

+ 27
- 0
internal/logic/xjlab/task_resource_usage.go View File

@@ -0,0 +1,27 @@
package xjlab

import (
"context"

"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
)

// TaskResourceUsageLogic bundles what the task resource-usage handler needs:
// a logger, the request context and the shared service context.
type TaskResourceUsageLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}

// NewTaskResourceUsageLogic builds a TaskResourceUsageLogic whose logger is
// bound to ctx and which keeps ctx and svcCtx for later use.
func NewTaskResourceUsageLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskResourceUsageLogic {
	logic := new(TaskResourceUsageLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}

// TaskResourceUsage is the entry point for the task resource-usage query.
// It is currently a stub: req is ignored and both results are nil.
// NOTE(review): callers get a nil *types.CommonResp together with a nil
// error — confirm they tolerate that until the logic is implemented.
func (l *TaskResourceUsageLogic) TaskResourceUsage(req *types.FId) (*types.CommonResp, error) {
return nil, nil
}

+ 155
- 0
internal/logic/xjlab/task_status_statistics.go View File

@@ -0,0 +1,155 @@
package xjlab

import (
"context"

"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
)

// Task status values. categorizeTaskStatus compares them against the status
// column returned by the task statistics query.
// NOTE(review): StatusUndefined is lower-case while every other value is
// capitalized — confirm it matches what is actually stored.
const (
StatusCompleted = "Completed"
StatusFailed = "Failed"
StatusRunning = "Running"
StatusSaved = "Saved"
StatusStopped = "Stopped"
StatusSucceeded = "Succeeded"
StatusUndefined = "undefined"
StatusWaiting = "Waiting"
StatusWaitRestart = "WaitRestart"
)

// TaskStatusStatistics is the logic object behind the task status statistics
// endpoints. It holds a logger, the request context and the shared service
// context.
type TaskStatusStatistics struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}

// NewTaskStatusStatisticsLogic builds a TaskStatusStatistics whose logger is
// bound to ctx and which keeps ctx and svcCtx for later use.
func NewTaskStatusStatisticsLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskStatusStatistics {
	logic := new(TaskStatusStatistics)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}

// TaskStatus is the task status statistics payload: a grand total, one
// counter per status value, and three aggregated business categories.
type TaskStatus struct {
// Total number of tasks across all statuses.
Total int64 `json:"total"`

// Per-status counters.
Completed int64 `json:"completed"` // completed
Failed int64 `json:"failed"` // failed
Running int64 `json:"running"` // running
Saved int64 `json:"saved"` // saved
Stopped int64 `json:"stopped"` // stopped
Succeeded int64 `json:"succeeded"` // succeeded
Undefined int64 `json:"undefined"` // undefined
Waiting int64 `json:"waiting"` // waiting
WaitRestart int64 `json:"waitRestart"` // waiting for restart

// Aggregated business categories.
NormalCount int64 `json:"normalCount"` // normal tasks (Completed + Succeeded + Saved)
RunningCount int64 `json:"runningCount"` // in-progress tasks (Running + Waiting + WaitRestart)
ErrorCount int64 `json:"errorCount"` // abnormal tasks (Failed + Stopped + Undefined, plus any unrecognized status)
}

// StatusResult is one row of the grouped status query: a status value and
// the number of tasks that have it.
type StatusResult struct {
Status string `gorm:"column:status" json:"status"`
Count int64 `gorm:"column:count" json:"count"`
}

// GetTaskStatusStatistics counts the non-deleted rows of the task table per
// status and folds the counts into a TaskStatus.
//
// req is not read. On a query failure the error is logged and a default
// error with a fixed message is returned instead of the database error.
func (l *TaskStatusStatistics) GetTaskStatusStatistics(req *types.XJLABCommonReq) (*TaskStatus, error) {
	var rows []StatusResult

	// Only rows that have not been soft-deleted are counted.
	query := l.svcCtx.DbEngin.Model(&types.TaskModel{}).
		Table("task").
		Where("deleted_at IS NULL")

	// One result row per distinct status.
	query = query.Select("status, COUNT(*) as count").Group("status")

	if err := query.Find(&rows).Error; err != nil {
		l.Errorf("Failed to query task status statistics: %v", err)
		return nil, result.NewDefaultError("Failed to get task status statistics")
	}

	summary := new(TaskStatus)
	for i := range rows {
		summary.Total += rows[i].Count
		l.categorizeTaskStatus(summary, rows[i].Status, rows[i].Count)
	}

	l.Infof("Task status statistics retrieved successfully. Total: %d, Normal: %d, Running: %d, Error: %d",
		summary.Total, summary.NormalCount, summary.RunningCount, summary.ErrorCount)

	return summary, nil
}

// categorizeTaskStatus adds count to the counter that belongs to status and
// to the business category that status falls into:
//
//   - normal:  Completed, Saved, Succeeded
//   - running: Running, Waiting, WaitRestart
//   - error:   Failed, Stopped, Undefined
//
// A status that is none of the known constants is logged and counted as an
// error. It has no dedicated per-status counter.
//
// Per-status counters accumulate (+=), the same way the category totals do,
// so the two stay consistent even if a status is reported more than once.
func (l *TaskStatusStatistics) categorizeTaskStatus(stats *TaskStatus, status string, count int64) {
	switch status {
	case StatusCompleted:
		stats.Completed += count
		stats.NormalCount += count
	case StatusFailed:
		stats.Failed += count
		stats.ErrorCount += count
	case StatusRunning:
		stats.Running += count
		stats.RunningCount += count
	case StatusSaved:
		stats.Saved += count
		stats.NormalCount += count
	case StatusStopped:
		stats.Stopped += count
		stats.ErrorCount += count
	case StatusSucceeded:
		stats.Succeeded += count
		stats.NormalCount += count
	case StatusUndefined:
		stats.Undefined += count
		stats.ErrorCount += count
	case StatusWaiting:
		stats.Waiting += count
		stats.RunningCount += count
	case StatusWaitRestart:
		stats.WaitRestart += count
		stats.RunningCount += count
	default:
		// Record the unknown status and treat it as an error.
		l.Logger.Errorf("Unknown task status encountered: %s with count: %d", status, count)
		stats.ErrorCount += count
	}
}

// GetSimpleTaskStatistics returns a reduced view of the task statistics: the
// grand total plus the normal, error and running category counts, keyed by
// "totalCount", "normalCount", "errorCount" and "runningCount".
//
// Any error from GetTaskStatusStatistics is returned unchanged.
func (l *TaskStatusStatistics) GetSimpleTaskStatistics(req *types.XJLABCommonReq) (*map[string]interface{}, error) {
	full, err := l.GetTaskStatusStatistics(req)
	if err != nil {
		return nil, err
	}

	summary := make(map[string]interface{}, 4)
	summary["totalCount"] = full.Total          // all tasks
	summary["normalCount"] = full.NormalCount   // normal tasks
	summary["errorCount"] = full.ErrorCount     // task alerts
	summary["runningCount"] = full.RunningCount // tasks in progress

	return &summary, nil
}

+ 25
- 21
internal/scheduler/database/aiStorage.go View File

@@ -1,6 +1,10 @@
package database package database


import ( import (
"strconv"
"time"

jsoniter "github.com/json-iterator/go"
"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client" clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
@@ -11,8 +15,6 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gorm.io/gorm" "gorm.io/gorm"
"strconv"
"time"
) )


type AiStorage struct { type AiStorage struct {
@@ -125,7 +127,7 @@ func (s *AiStorage) DoesTaskNameExist(name string, taskType string) (bool, error
return total > 0, nil return total > 0, nil
} }


func (s *AiStorage) SaveTask(name string, desc string, userId int64, strategyCode int64, synergyStatus int64, aiType string, yaml string, saveToChain func(task models.Task, id int64) error) (int64, error) {
func (s *AiStorage) SaveTask(name string, desc string, userId int64, strategyCode int64, synergyStatus int64, aiType string, yaml string, saveToChain func(task models.Task, id int64) error, userName string) (int64, error) {
startTime := time.Now() startTime := time.Now()


// 构建主任务结构体 // 构建主任务结构体
@@ -135,6 +137,7 @@ func (s *AiStorage) SaveTask(name string, desc string, userId int64, strategyCod
Description: desc, Description: desc,
Name: name, Name: name,
UserId: userId, UserId: userId,
UserName: userName,
SynergyStatus: synergyStatus, SynergyStatus: synergyStatus,
Strategy: strategyCode, Strategy: strategyCode,
AdapterTypeDict: "1", AdapterTypeDict: "1",
@@ -214,25 +217,26 @@ func (s *AiStorage) SaveAiTask(taskId int64, opt option.Option, adapterName stri
if err != nil { if err != nil {
return err return err
} }
resourceSpec, _ := jsoniter.MarshalToString(aiOpt.ResourcesRequired)
aiTaskModel := models.TaskAi{ aiTaskModel := models.TaskAi{
TaskId: taskId,
AdapterId: aId,
AdapterName: adapterName,
ClusterId: cId,
ClusterName: clusterName,
Name: aiOpt.TaskName,
Replica: int64(aiOpt.Replica),
JobId: jobId,
TaskType: aiOpt.TaskType,
ModelName: aiOpt.ModelName,
Strategy: aiOpt.StrategyName,
Status: status,
Msg: msg,
Output: aiOpt.Output,
Card: aiOpt.ComputeCard,
StartTime: time.Now().Format(time.RFC3339),
CommitTime: time.Now(),
TaskId: taskId,
AdapterId: aId,
AdapterName: adapterName,
ClusterId: cId,
ClusterName: clusterName,
Name: aiOpt.TaskName,
Replica: int64(aiOpt.Replica),
JobId: jobId,
TaskType: aiOpt.TaskType,
ModelName: aiOpt.ModelName,
Strategy: aiOpt.StrategyName,
Status: status,
Msg: msg,
Output: aiOpt.Output,
Card: aiOpt.ComputeCard,
StartTime: time.Now().Format(time.RFC3339),
CommitTime: time.Now(),
ResourceSpec: resourceSpec,
} }
// 保存任务数据到数据库 // 保存任务数据到数据库
tx := s.DbEngin.Create(&aiTaskModel) tx := s.DbEngin.Create(&aiTaskModel)


+ 4
- 3
internal/scheduler/scheduler.go View File

@@ -16,6 +16,8 @@ package scheduler


import ( import (
"encoding/json" "encoding/json"
"strings"

"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/config" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
@@ -27,7 +29,6 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/response" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/response"
"gorm.io/gorm" "gorm.io/gorm"
"sigs.k8s.io/yaml" "sigs.k8s.io/yaml"
"strings"
) )


type Scheduler struct { type Scheduler struct {
@@ -189,13 +190,13 @@ func (s *Scheduler) SaveToDb() error {
return nil return nil
} }


func (s *Scheduler) CreateTask(taskName string, desc string, userId int64, synergyCode int64, strategyName string, yaml string, token string, userIp string, config *config.Config) (int64, error) {
func (s *Scheduler) CreateTask(taskName string, desc string, userId int64, synergyCode int64, strategyName string, yaml string, token string, userIp string, config *config.Config, userName string) (int64, error) {
strategyCode, err := s.AiStorages.GetStrategyCode(strategyName) strategyCode, err := s.AiStorages.GetStrategyCode(strategyName)
if err != nil { if err != nil {
return 0, err return 0, err
} }


id, err := s.AiStorages.SaveTask(taskName, desc, userId, strategyCode, synergyCode, "10", yaml, nil)
id, err := s.AiStorages.SaveTask(taskName, desc, userId, strategyCode, synergyCode, "10", yaml, nil, userName)
if err != nil { if err != nil {
return 0, err return 0, err
} }


+ 5
- 4
internal/scheduler/schedulers/aiScheduler.go View File

@@ -19,6 +19,10 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"strconv"
"strings"
"sync"

"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-ac/hpcAC" "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler"
@@ -39,9 +43,6 @@ import (
omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model" omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
"gitlink.org.cn/JointCloud/pcm-octopus/octopus" "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
"gitlink.org.cn/JointCloud/pcm-openi/model" "gitlink.org.cn/JointCloud/pcm-openi/model"
"strconv"
"strings"
"sync"
) )


type AiScheduler struct { type AiScheduler struct {
@@ -248,7 +249,7 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
var taskId int64 var taskId int64
switch mode { switch mode {
case executor.SUBMIT_MODE_JOINT_CLOUD: case executor.SUBMIT_MODE_JOINT_CLOUD:
tid, err := as.CreateTask(as.option.TaskName, "", 0, synergystatus, as.option.StrategyName, "", "", "", nil)
tid, err := as.CreateTask(as.option.TaskName, "", 0, synergystatus, as.option.StrategyName, "", "", "", nil, "0")
if err != nil { if err != nil {
return err return err
} }


+ 10
- 9
internal/scheduler/service/inference/imageInference/imageInference.go View File

@@ -3,6 +3,15 @@ package imageInference
import ( import (
"encoding/json" "encoding/json"
"errors" "errors"
"log"
"math/rand"
"mime/multipart"
"net/http"
"sort"
"strconv"
"sync"
"time"

"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
@@ -14,14 +23,6 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"log"
"math/rand"
"mime/multipart"
"net/http"
"sort"
"strconv"
"sync"
"time"
) )


type IImageInference interface { type IImageInference interface {
@@ -126,7 +127,7 @@ func (i *ImageInference) saveTask() (int64, error) {
return 0, err return 0, err
} }


id, err := i.storage.SaveTask(i.opt.TaskName, "", 0, strategyCode, synergystatus, i.inference.GetAiType(), "", nil)
id, err := i.storage.SaveTask(i.opt.TaskName, "", 0, strategyCode, synergystatus, i.inference.GetAiType(), "", nil, "0")
if err != nil { if err != nil {
return 0, err return 0, err
} }


+ 1
- 1
internal/scheduler/service/inference/textInference/textInference.go View File

@@ -70,7 +70,7 @@ func (ti *TextInference) saveTask() (int64, error) {
var synergystatus int64 var synergystatus int64
var strategyCode int64 var strategyCode int64


id, err := ti.storage.SaveTask(ti.opt.TaskName, "", 0, strategyCode, synergystatus, ti.inference.GetAiType(), "", nil)
id, err := ti.storage.SaveTask(ti.opt.TaskName, "", 0, strategyCode, synergystatus, ti.inference.GetAiType(), "", nil, "0")
if err != nil { if err != nil {
return 0, err return 0, err
} }


+ 1
- 0
internal/types/cloud/container.go View File

@@ -14,6 +14,7 @@ type CreateParam struct {
ContainerGroupName string `json:"containerGroupName"` ContainerGroupName string `json:"containerGroupName"`
Description string `json:"description,omitempty"` Description string `json:"description,omitempty"`
UserId int64 `json:"userId"` UserId int64 `json:"userId"`
UserName string `json:"userName"`
Name string `json:"name"` Name string `json:"name"`
Image string `json:"image"` Image string `json:"image"`
ImageRegistry string `json:"imageRegistry,omitempty"` ImageRegistry string `json:"imageRegistry,omitempty"`


+ 52
- 0
internal/types/types.go View File

@@ -1465,6 +1465,7 @@ type CreateSubnetResp struct {
type CreateTaskReq struct { type CreateTaskReq struct {
Name string `json:"name"` Name string `json:"name"`
UserId int64 `json:"userId,optional"` UserId int64 `json:"userId,optional"`
UserName string `json:"userName,optional"`
Description string `json:"description,optional"` Description string `json:"description,optional"`
Token string `json:"token,optional"` Token string `json:"token,optional"`
UserIp string `json:"userIp,optional"` UserIp string `json:"userIp,optional"`
@@ -2500,6 +2501,7 @@ type Floatingips struct {
type GeneralTaskReq struct { type GeneralTaskReq struct {
Token string `json:"token,optional"` Token string `json:"token,optional"`
UserId int64 `json:"userId,optional"` UserId int64 `json:"userId,optional"`
UserName string `json:"userName,optional"`
Name string `json:"name"` Name string `json:"name"`
AdapterIds []string `json:"adapterIds"` AdapterIds []string `json:"adapterIds"`
ClusterIds []string `json:"clusterIds"` ClusterIds []string `json:"clusterIds"`
@@ -5647,6 +5649,7 @@ type TaskModel struct {
AdapterTypeDict string `json:"adapterTypeDict" db:"adapter_type_dict" gorm:"adapter_type_dict"` //适配器类型(对应字典表的值 AdapterTypeDict string `json:"adapterTypeDict" db:"adapter_type_dict" gorm:"adapter_type_dict"` //适配器类型(对应字典表的值
TaskTypeDict string `json:"taskTypeDict" db:"task_type_dict" gorm:"task_type_dict"` //任务类型(对应字典表的值 TaskTypeDict string `json:"taskTypeDict" db:"task_type_dict" gorm:"task_type_dict"` //任务类型(对应字典表的值
UserId int64 `json:"userId,omitempty" db:"user_id"` UserId int64 `json:"userId,omitempty" db:"user_id"`
UserName string `json:"userName,omitempty" db:"user_name"`
} }


type TaskSl struct { type TaskSl struct {
@@ -6537,3 +6540,52 @@ type TaskNumResp struct {
History int `json:"history"` History int `json:"history"`
Failed int `json:"failed"` Failed int `json:"failed"`
} }

// JccUserInfo carries the JCC user identity (name and id) attached to a
// request. Both fields are optional.
type JccUserInfo struct {
UserName string `json:"userName,optional"`
UserId int64 `json:"userId,optional"`
}

// XJLABTaskReq is the request for the XJLAB task list. All filter fields are
// optional and carry `form` tags; PageInfo and JccUserInfo are embedded.
// Note that the struct's own UserName field shadows the UserName promoted
// from the embedded JccUserInfo.
type XJLABTaskReq struct {
Id string `form:"id,optional"`
Name string `form:"name,optional"`
Status string `form:"status,optional"`
UserName string `form:"userName,optional"`
AdapterTypeDict string `form:"adapterTypeDict,optional"`
StartTime string `form:"startTime,optional"`
EndTime string `form:"endTime,optional"`
PageInfo
JccUserInfo
}

// XJLABTaskResp is one task entry of the XJLAB task list response. It holds
// the task's own columns plus user, resource and cluster details.
type XJLABTaskResp struct {
Id int64 `json:"id,omitempty,string" db:"id"` // id (serialized as a JSON string)
Name string `json:"name,omitempty" db:"name"` // job name
Description string `json:"description,omitempty" db:"description"` // job description
Status string `json:"status,omitempty" db:"status"` // job status
Strategy int64 `json:"strategy" db:"strategy"` // strategy
SynergyStatus int64 `json:"synergyStatus" db:"synergy_status"` // synergy status (0 = not coordinated, 1 = coordinated)
CommitTime string `json:"commitTime,omitempty" db:"commit_time"` // commit time
StartTime string `json:"startTime,omitempty" db:"start_time"` // start time
EndTime string `json:"endTime,omitempty" db:"end_time"` // end-of-run time
RunningTime int64 `json:"runningTime" db:"running_time"` // elapsed running time, in seconds
YamlString string `json:"yamlString,omitempty" db:"yaml_string"`
Result string `json:"result,omitempty" db:"result"` // job result
DeletedAt string `json:"deletedAt,omitempty" gorm:"index" db:"deleted_at"`
NsID string `json:"nsId,omitempty" db:"ns_id"`
TenantId string `json:"tenantId,omitempty" db:"tenant_id"`
CreatedTime string `json:"createdTime,omitempty" db:"created_time" gorm:"autoCreateTime"`
UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time"`
AdapterTypeDict string `json:"adapterTypeDict" db:"adapter_type_dict" gorm:"adapter_type_dict"` // adapter type (value from the dictionary table)
TaskTypeDict string `json:"taskTypeDict" db:"task_type_dict" gorm:"task_type_dict"` // task type (value from the dictionary table)
UserId int64 `json:"userId,omitempty" db:"user_id"`
UserName string `json:"userName,omitempty" db:"-"` // user name
ResourceId string `json:"resourceId,omitempty" db:"resource_id"` // resource id
ResourceName string `json:"resourceName,omitempty" db:"resource_name"` // resource name
ClusterId string `json:"clusterId,omitempty" db:"cluster_id"` // cluster id
ClusterName string `json:"clusterName,omitempty" db:"cluster_name"` // cluster name
}

// XJLABCommonReq is the request type for XJLAB endpoints that take no
// parameters. It has no fields.
type XJLABCommonReq struct {
}

+ 24
- 0
pkg/constants/const.go View File

@@ -0,0 +1,24 @@
package constants

// Key names for the user id and user name.
// NOTE(review): presumably used as context or header keys — confirm against
// the handlers that read them.
const (
UserId = "UserId"
UserName = "UserName"
)

// Task status values shared across packages.
const (
StatusSaved = "Saved"
StatusDeploying = "Deploying"
)
// Adapter type codes, as stored in a task's AdapterTypeDict field.
const (
AdapterTypeCloud = "0" // cloud computing tasks
AdapterTypeAI = "1" // AI computing tasks
AdapterTypeHPC = "2" // high-performance computing tasks
)

// HPCSystemType identifies the kind of HPC computing system.
type HPCSystemType string

// Known HPC system types.
const (
HPC_SYSTEM_SLURM HPCSystemType = "slurm"
HPC_SYSTEM_AC HPCSystemType = "ac"
)

+ 22
- 21
pkg/models/taskaimodel_gen.go View File

@@ -35,27 +35,28 @@ type (
} }


TaskAi struct { TaskAi struct {
Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id
AdapterId int64 `db:"adapter_id"` // 适配器id
AdapterName string `db:"adapter_name"` // 适配器名称
ClusterId int64 `db:"cluster_id"` // 集群id
ClusterName string `db:"cluster_name"` // 集群名称
Name string `db:"name"` // 任务名
Replica int64 `db:"replica"` // 执行数
JobId string `db:"job_id"` // 集群返回任务id
Strategy string `db:"strategy"` // 主任务使用策略
Status string `db:"status"` // 任务状态
Msg string `db:"msg"` // 集群返回任务信息
CommitTime time.Time `db:"commit_time"` // 提交时间
StartTime string `db:"start_time"` // 开始时间
EndTime string `db:"end_time"` // 结束时间
TaskType string `db:"task_type"`
DeletedAt *time.Time `db:"deleted_at"`
Card string `db:"card"`
InferUrl string `db:"infer_url"`
ModelName string `db:"model_name"`
Output string `db:"output"`
Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id
AdapterId int64 `db:"adapter_id"` // 适配器id
AdapterName string `db:"adapter_name"` // 适配器名称
ClusterId int64 `db:"cluster_id"` // 集群id
ClusterName string `db:"cluster_name"` // 集群名称
Name string `db:"name"` // 任务名
Replica int64 `db:"replica"` // 执行数
JobId string `db:"job_id"` // 集群返回任务id
Strategy string `db:"strategy"` // 主任务使用策略
Status string `db:"status"` // 任务状态
Msg string `db:"msg"` // 集群返回任务信息
CommitTime time.Time `db:"commit_time"` // 提交时间
StartTime string `db:"start_time"` // 开始时间
EndTime string `db:"end_time"` // 结束时间
TaskType string `db:"task_type"`
DeletedAt *time.Time `db:"deleted_at"`
Card string `db:"card"`
InferUrl string `db:"infer_url"`
ModelName string `db:"model_name"`
Output string `db:"output"`
ResourceSpec string `db:"resource_spec"`
} }
) )




+ 58
- 49
pkg/models/taskhpcmodel_gen.go View File

@@ -36,55 +36,64 @@ type (
} }


TaskHpc struct { TaskHpc struct {
Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id
JobId string `db:"job_id"` // 作业id(在第三方系统中的作业id)
AdapterId int64 `db:"adapter_d"` // 适配器id
AdapterName string `db:"adapter_name"` //适配器名称
ClusterId int64 `db:"cluster_id"` //集群id
ClusterName string `db:"cluster_name"` //集群名称
Name string `db:"name"` // 名称
Backend string `db:"backend"` // 平台类型
OperateType string `db:"operate_type"` // 操作类型
Status string `db:"status"` // 状态
CmdScript string `db:"cmd_script"`
StartTime string `db:"start_time"` // 开始时间
EndTime string `db:"end_time"` // 结束时间
RunningTime int64 `db:"running_time"` // 运行时间
DerivedEs string `db:"derived_es"`
Cluster string `db:"cluster"`
BlockId int64 `db:"block_id"`
AllocNodes int64 `db:"alloc_nodes"`
AllocCpu int64 `db:"alloc_cpu"`
CardCount int64 `db:"card_count"` // 卡数
Version string `db:"version"`
Account string `db:"account"`
WorkDir string `db:"work_dir"` // 工作路径
AssocId int64 `db:"assoc_id"`
ExitCode int64 `db:"exit_code"`
WallTime string `db:"wall_time"` // 最大运行时间
Result string `db:"result"` // 运行结果
DeletedAt sql.NullTime `db:"deleted_at"` // 删除时间
YamlString string `db:"yaml_string"`
AppType string `db:"app_type"` // 应用类型
AppName string `db:"app_name"` // 应用名称
Queue string `db:"queue"` // 队列名称
SubmitType string `db:"submit_type"` // cmd(命令行模式)
NNode string `db:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"")
StdOutFile string `db:"std_out_file"` // 工作路径/std.err.%j
StdErrFile string `db:"std_err_file"` // 工作路径/std.err.%j
StdInput string `db:"std_input"`
Partition string `db:"partition"`
DeletedFlag int64 `db:"deleted_flag"` // 是否删除(0-否,1-是)
CreatedBy int64 `db:"created_by"` // 创建人
CreatedTime time.Time `db:"created_time"` // 创建时间
UpdatedBy int64 `db:"updated_by"` // 更新人
UpdatedTime time.Time `db:"updated_time"` // 更新时间
UserId int64 `db:"user_id"`
TimeLimit int64 `db:"time_limit"`
Params string `db:"params"` // 渲染参数
Script string `db:"script"` // 生成的脚本
TemplateId int64 `db:"template_id"` // 模板ID
Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id
JobId string `db:"job_id"` // 作业id(在第三方系统中的作业id)
AdapterId int64 `db:"adapter_d"` // 适配器id
AdapterName string `db:"adapter_name"` //适配器名称
ClusterId int64 `db:"cluster_id"` //集群id
ClusterName string `db:"cluster_name"` //集群名称
Name string `db:"name"` // 名称
Backend string `db:"backend"` // 平台类型
OperateType string `db:"operate_type"` // 操作类型
Status string `db:"status"` // 状态
CmdScript string `db:"cmd_script"`
StartTime string `db:"start_time"` // 开始时间
EndTime string `db:"end_time"` // 结束时间
RunningTime int64 `db:"running_time"` // 运行时间
DerivedEs string `db:"derived_es"`
Cluster string `db:"cluster"`
BlockId int64 `db:"block_id"`
AllocNodes int64 `db:"alloc_nodes"`
AllocCpu int64 `db:"alloc_cpu"`
CardCount int64 `db:"card_count"` // 卡数
Version string `db:"version"`
Account string `db:"account"`
WorkDir string `db:"work_dir"` // 工作路径
AssocId int64 `db:"assoc_id"`
ExitCode int64 `db:"exit_code"`
WallTime string `db:"wall_time"` // 最大运行时间
Result string `db:"result"` // 运行结果
DeletedAt sql.NullTime `db:"deleted_at"` // 删除时间
YamlString string `db:"yaml_string"`
AppType string `db:"app_type"` // 应用类型
AppName string `db:"app_name"` // 应用名称
Queue string `db:"queue"` // 队列名称
SubmitType string `db:"submit_type"` // cmd(命令行模式)
NNode string `db:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"")
StdOutFile string `db:"std_out_file"` // 工作路径/std.err.%j
StdErrFile string `db:"std_err_file"` // 工作路径/std.err.%j
StdInput string `db:"std_input"`
Partition string `db:"partition"`
DeletedFlag int64 `db:"deleted_flag"` // 是否删除(0-否,1-是)
CreatedBy int64 `db:"created_by"` // 创建人
CreatedTime time.Time `db:"created_time"` // 创建时间
UpdatedBy int64 `db:"updated_by"` // 更新人
UpdatedTime time.Time `db:"updated_time"` // 更新时间
UserId int64 `db:"user_id"`
TimeLimit int64 `db:"time_limit"`
Params string `db:"params"` // 渲染参数
Script string `db:"script"` // 生成的脚本
TemplateId int64 `db:"template_id"` // 模板ID
ResourceSpec ResourceSpec `json:"resourceSpec" gorm:"serializer:json"`
}

ResourceSpec struct {
ResourceId string `json:"resourceId"` // 资源ID
ResourceName string `json:"resourceName"` // 资源名称
ResourceType string `json:"resourceType"` // 资源类型
Partition string `json:"partition"` // 分区
Specifications interface{} `json:"specifications"`
} }
) )




+ 1
- 0
pkg/models/taskmodel_gen.go View File

@@ -52,6 +52,7 @@ type (
AdapterTypeDict string `db:"adapter_type_dict" json:"adapterTypeDict"` //任务类型(对应字典表的值) AdapterTypeDict string `db:"adapter_type_dict" json:"adapterTypeDict"` //任务类型(对应字典表的值)
TaskTypeDict string `db:"task_type_dict" json:"taskTypeDict"` TaskTypeDict string `db:"task_type_dict" json:"taskTypeDict"`
UserId int64 `db:"user_id" json:"userId"` UserId int64 `db:"user_id" json:"userId"`
UserName string `db:"user_name" json:"userName"` // 提交人
} }
) )




+ 394
- 0
pkg/utils/slurm_parser.go View File

@@ -0,0 +1,394 @@
package utils

import (
"bufio"
"fmt"
"os"
"regexp"
"strconv"
"strings"
)

// SlurmResource holds the options extracted from the #SBATCH directives of a
// SLURM batch script. Values are kept as the strings found in the script;
// a field is empty when the script does not set the option.
type SlurmResource struct {
JobName string `json:"job_name"` // job name
CPUs string `json:"cpus"` // total CPU count; derived as tasks × CPUs-per-task when both are present
Memory string `json:"memory"` // memory
Nodes string `json:"nodes"` // node count
Tasks string `json:"tasks"` // total number of tasks
TasksPerNode string `json:"tasks_per_node"` // tasks per node
CPUsPerTask string `json:"cpus_per_task"` // CPUs per task
Partition string `json:"partition"` // queue / partition
Time string `json:"time"` // time limit
Output string `json:"output"` // output file
Error string `json:"error"` // error file
QOS string `json:"qos"` // quality of service
Account string `json:"account"` // account
GPUs string `json:"gpus"` // GPU count
GPUType string `json:"gpu_type"` // GPU type
Constraint string `json:"constraint"` // node constraint
Exclusive bool `json:"exclusive"` // exclusive node access
ArrayJobID string `json:"array_job_id"` // value of the --array option
WorkingDir string `json:"working_dir"` // working directory
MailType string `json:"mail_type"` // mail notification type
MailUser string `json:"mail_user"` // mail recipient
}

// SlurmParser extracts #SBATCH options from SLURM batch scripts.
type SlurmParser struct {
// patterns maps a field key (e.g. "job_name") to the regular expressions
// tried, in order, to extract that field from a directive line.
patterns map[string][]*regexp.Regexp
}

// NewSlurmParser returns a parser with an empty pattern table that has been
// populated by initPatterns.
func NewSlurmParser() *SlurmParser {
	parser := new(SlurmParser)
	parser.patterns = map[string][]*regexp.Regexp{}
	parser.initPatterns()
	return parser
}

// initPatterns fills p.patterns with the regular expressions used to extract
// each supported #SBATCH option.
//
// Order matters: matchPattern returns the first pattern that matches, so the
// more specific forms (quoted values, node ranges, typed gres) are listed
// before the generic ones. With the generic form first the specific patterns
// were unreachable: quoted values kept their quotes and "--nodes=1-4" was
// truncated to "1".
//
// The long-option patterns use `[=\s]+` between the option and its value,
// which already covers "--opt=value", "--opt value" and "--opt = value", so
// no separate `\s*=\s*` variants are needed.
func (p *SlurmParser) initPatterns() {
	// Job name; quoted forms first so the quotes are stripped.
	p.patterns["job_name"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--job-name[=\s]+"([^"]+)"`),
		regexp.MustCompile(`#SBATCH\s+--job-name[=\s]+'([^']+)'`),
		regexp.MustCompile(`#SBATCH\s+--job-name[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-J\s+([^\s]+)`),
	}

	// CPUs per task.
	p.patterns["cpus_per_task"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--cpus-per-task[=\s]+(\d+)`),
		regexp.MustCompile(`#SBATCH\s+-c\s+(\d+)`),
	}

	// Memory. There is deliberately no "-m" pattern: in sbatch "-m" is the
	// short form of --distribution, not of --mem.
	p.patterns["memory"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--mem[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+--mem-per-cpu[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+--mem-per-node[=\s]+([^\s]+)`),
	}

	// Node count, either a single number or a "min-max" range.
	p.patterns["nodes"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--nodes[=\s]+(\d+(?:-\d+)?)`),
		regexp.MustCompile(`#SBATCH\s+-N\s+(\d+(?:-\d+)?)`),
	}

	// Total number of tasks.
	p.patterns["tasks"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--ntasks[=\s]+(\d+)`),
		regexp.MustCompile(`#SBATCH\s+-n\s+(\d+)`),
	}

	// Tasks per node.
	p.patterns["tasks_per_node"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--ntasks-per-node[=\s]+(\d+)`),
	}

	// Partition / queue; quoted form first so the quotes are stripped.
	p.patterns["partition"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--partition[=\s]+"([^"]+)"`),
		regexp.MustCompile(`#SBATCH\s+--partition[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-p\s+([^\s]+)`),
	}

	// Time limit.
	p.patterns["time"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--time[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-t\s+([^\s]+)`),
	}

	// Output file.
	p.patterns["output"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--output[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-o\s+([^\s]+)`),
	}

	// Error file.
	p.patterns["error"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--error[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-e\s+([^\s]+)`),
	}

	// Quality of service.
	p.patterns["qos"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--qos[=\s]+([^\s]+)`),
	}

	// Account.
	p.patterns["account"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--account[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-A\s+([^\s]+)`),
	}

	// GPUs. The typed gres form ("gpu:<type>:<count>", two capture groups)
	// is listed before the count-only form so that a numeric type such as
	// "gpu:3090:2" is not mistaken for a count.
	p.patterns["gpus"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--gpus[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+--gpus-per-node[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+--gpus-per-task[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+--gres[=\s]+gpu:([^:\s]+):(\d+)`),
		regexp.MustCompile(`#SBATCH\s+--gres[=\s]+gpu:(\d+)`),
	}

	// Node constraint; quoted form first so the quotes are stripped.
	p.patterns["constraint"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--constraint[=\s]+"([^"]+)"`),
		regexp.MustCompile(`#SBATCH\s+--constraint[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-C\s+([^\s]+)`),
	}

	// Exclusive node access. Presence-only flag: the pattern has no capture
	// group.
	p.patterns["exclusive"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--exclusive`),
	}

	// Job array specification.
	p.patterns["array"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--array[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-a\s+([^\s]+)`),
	}

	// Working directory.
	p.patterns["workdir"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--chdir[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+--workdir[=\s]+([^\s]+)`),
		regexp.MustCompile(`#SBATCH\s+-D\s+([^\s]+)`),
	}

	// Mail notification.
	p.patterns["mail_type"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--mail-type[=\s]+([^\s]+)`),
	}

	p.patterns["mail_user"] = []*regexp.Regexp{
		regexp.MustCompile(`#SBATCH\s+--mail-user[=\s]+([^\s]+)`),
	}
}

// ParseScript extracts the #SBATCH options from scriptContent.
//
// Each line is trimmed, and lines that do not start with "#SBATCH" are
// ignored. For string options the first occurrence wins (parseField only
// fills empty fields); GPU settings are handled by parseGPU. After all lines
// are read, postProcess derives values the script did not state explicitly.
//
// The result is never nil; options the script does not set stay at their
// zero value.
func (p *SlurmParser) ParseScript(scriptContent string) *SlurmResource {
	resource := &SlurmResource{}

	scanner := bufio.NewScanner(strings.NewReader(scriptContent))
	// The default Scanner stops with ErrTooLong on a line over 64 KiB, which
	// would silently drop every directive after it. Allow a line to be as
	// long as the whole script so that cannot happen.
	maxLine := bufio.MaxScanTokenSize
	if n := len(scriptContent) + 1; n > maxLine {
		maxLine = n
	}
	scanner.Buffer(nil, maxLine)

	// Pattern key -> destination field, in the order they are tried.
	targets := []struct {
		field string
		dst   *string
	}{
		{"job_name", &resource.JobName},
		{"cpus_per_task", &resource.CPUsPerTask},
		{"memory", &resource.Memory},
		{"nodes", &resource.Nodes},
		{"tasks", &resource.Tasks},
		{"tasks_per_node", &resource.TasksPerNode},
		{"partition", &resource.Partition},
		{"time", &resource.Time},
		{"output", &resource.Output},
		{"error", &resource.Error},
		{"qos", &resource.QOS},
		{"account", &resource.Account},
		{"constraint", &resource.Constraint},
		{"array", &resource.ArrayJobID},
		{"workdir", &resource.WorkingDir},
		{"mail_type", &resource.MailType},
		{"mail_user", &resource.MailUser},
	}

	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())

		// Only #SBATCH directive lines carry options.
		if !strings.HasPrefix(line, "#SBATCH") {
			continue
		}

		for _, t := range targets {
			p.parseField(line, t.field, t.dst)
		}

		p.parseGPU(line, resource)

		// --exclusive is a bare flag. Test for a match directly: going
		// through matchPattern never worked because it only reports
		// patterns that have a capture group.
		if !resource.Exclusive {
			for _, pattern := range p.patterns["exclusive"] {
				if pattern.MatchString(line) {
					resource.Exclusive = true
					break
				}
			}
		}
	}

	// Derive information the script left implicit.
	p.postProcess(resource)

	return resource
}

// parseField stores the value of the directive named field into *target when
// line supplies one.
//
// A target that already holds a value is left untouched, so the first
// directive that provides a value wins. When none of the field's patterns
// yields a value, *target is not modified.
func (p *SlurmParser) parseField(line, field string, target *string) {
	if *target != "" {
		// Already populated by an earlier line; keep that value.
		return
	}
	value := p.matchPattern(line, field)
	if value == "" {
		return
	}
	*target = value
}

// parseGPU extracts a GPU request from one #SBATCH line into resource.
//
// The patterns registered under "gpus" are tried in order; the first one that
// matches with at least one capture group decides the result and ends the
// search. If that pattern's source text contains the typed gres form
// ("gpu:<type>:<count>") and it produced two groups, the first group is stored
// as GPUType and the second as GPUs; otherwise the first group is stored as
// GPUs. Unlike parseField, previously stored values are overwritten.
func (p *SlurmParser) parseGPU(line string, resource *SlurmResource) {
	gpuPatterns, ok := p.patterns["gpus"]
	if !ok {
		return
	}

	for _, re := range gpuPatterns {
		groups := re.FindStringSubmatch(line)
		if len(groups) <= 1 {
			continue
		}

		// The typed pattern is recognised by its own source text.
		typed := strings.Contains(re.String(), "gres.*gpu:([^:]+):(\\d+)") && len(groups) > 2
		if typed {
			// type:count form
			resource.GPUType = groups[1]
			resource.GPUs = groups[2]
		} else {
			resource.GPUs = groups[1]
		}
		return
	}
}

// matchPattern returns the first capture group of the first pattern
// registered under field that matches line.
//
// It returns "" when the field has no patterns or none of them matches. A
// pattern without a capture group never produces a value here, because only
// matches carrying at least one group are accepted.
func (p *SlurmParser) matchPattern(line, field string) string {
	// Ranging over a missing key iterates zero times.
	for _, re := range p.patterns[field] {
		if groups := re.FindStringSubmatch(line); len(groups) > 1 {
			return groups[1]
		}
	}
	return ""
}

// postProcess fills in values that the script did not state explicitly.
//
//   - When CPUs is empty and both Tasks and CPUsPerTask parse as integers,
//     CPUs is set to their product.
//   - When Tasks is set but neither Nodes nor TasksPerNode is, Nodes is set
//     to "1" (a single node is assumed).
func (p *SlurmParser) postProcess(resource *SlurmResource) {
	if resource.CPUs == "" && resource.Tasks != "" && resource.CPUsPerTask != "" {
		taskCount, taskErr := strconv.Atoi(resource.Tasks)
		perTask, perTaskErr := strconv.Atoi(resource.CPUsPerTask)
		// Non-numeric values leave CPUs unset.
		if taskErr == nil && perTaskErr == nil {
			resource.CPUs = strconv.Itoa(taskCount * perTask)
		}
	}

	onlyTasksGiven := resource.Tasks != "" && resource.Nodes == "" && resource.TasksPerNode == ""
	if onlyTasksGiven {
		resource.Nodes = "1"
	}
}

// ParseFile reads the SLURM script stored at filename and parses it with
// ParseScript.
//
// If the file cannot be read, it returns a nil resource and an error that
// wraps the underlying read error, so callers can inspect the cause with
// errors.Is / errors.As (for example errors.Is(err, os.ErrNotExist)).
func (p *SlurmParser) ParseFile(filename string) (*SlurmResource, error) {
	content, err := os.ReadFile(filename)
	if err != nil {
		// %w keeps the original error in the chain; the rendered message is
		// the same as with %v.
		return nil, fmt.Errorf("读取文件失败: %w", err)
	}
	return p.ParseScript(string(content)), nil
}

// String renders the resource as a multi-line, human-readable report.
//
// The output starts with a two-line header followed by one line per populated
// field, in a fixed order. Empty fields are omitted; the GPU type is printed
// only when a GPU count is present, and the exclusive line only when
// Exclusive is true. WorkingDir, MailType and MailUser are not part of the
// report.
func (r *SlurmResource) String() string {
	var report strings.Builder
	report.WriteString("SLURM资源规格:\n")
	report.WriteString("====================\n")

	// line appends one formatted entry, skipping fields that are not set.
	line := func(format, value string) {
		if value == "" {
			return
		}
		fmt.Fprintf(&report, format, value)
	}

	line("作业名称: %s\n", r.JobName)
	line("队列/分区: %s\n", r.Partition)
	line("节点数: %s\n", r.Nodes)
	line("任务数: %s\n", r.Tasks)
	line("每节点任务数: %s\n", r.TasksPerNode)
	line("每任务CPU数: %s\n", r.CPUsPerTask)
	line("总CPU数: %s\n", r.CPUs)
	line("内存: %s\n", r.Memory)
	line("GPU数量: %s\n", r.GPUs)
	if r.GPUs != "" {
		// The GPU type is only meaningful alongside a GPU count.
		line("GPU类型: %s\n", r.GPUType)
	}
	line("运行时间: %s\n", r.Time)
	line("账户: %s\n", r.Account)
	line("服务质量: %s\n", r.QOS)
	line("节点约束: %s\n", r.Constraint)
	if r.Exclusive {
		report.WriteString("独占节点: 是\n")
	}
	line("数组作业: %s\n", r.ArrayJobID)
	line("输出文件: %s\n", r.Output)
	line("错误文件: %s\n", r.Error)

	return report.String()
}

// GetResourceSummary returns the core resource fields as a key/value map.
//
// Only populated fields are included. The keys are job_name, queue (the
// partition), nodes, tasks, total_cpus, cpus_per_task, memory, gpus and
// time_limit. The returned map is never nil.
func (r *SlurmResource) GetResourceSummary() map[string]string {
	entries := []struct {
		key   string
		value string
	}{
		{"job_name", r.JobName},
		{"queue", r.Partition},
		{"nodes", r.Nodes},
		{"tasks", r.Tasks},
		{"total_cpus", r.CPUs},
		{"cpus_per_task", r.CPUsPerTask},
		{"memory", r.Memory},
		{"gpus", r.GPUs},
		{"time_limit", r.Time},
	}

	summary := make(map[string]string)
	for _, e := range entries {
		if e.value == "" {
			continue
		}
		summary[e.key] = e.value
	}
	return summary
}

Loading…
Cancel
Save