|
- package storeLink
-
- import (
- "context"
- "errors"
- "fmt"
- "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
- hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
- "strconv"
- "strings"
- )
-
- type ShuguangHpc struct {
- aCRpc hpcacclient.HpcAC
- platform string
- participantId int64
- }
-
- const (
- GAP_WALL_TIME_24H = "24:00:00"
- TASK_SHUGUANG_PREFIX = "ShuguangHPC"
- NEWLINE = "\n"
- JOBNAME = "JOBNAME"
- GAP_CMD_FILE = "cmd"
- GAP_NNODE = "1" // 节点个数
- GAP_NODE_STRING = ""
- GAP_APPNAME = "BASE"
- GAP_QUEUE = "wzhdtest"
- GAP_WORK_DIR = "/work/home/acgnnmfbwo/BASE/JOBNAME"
- GAP_STD_OUT_FILE = "/work/home/acgnnmfbwo/BASE/JOBNAME/std.out.%j"
- GAP_STD_ERR_FILE = "/work/home/acgnnmfbwo/BASE/JOBNAME/std.err.%j"
- StrJobManagerID = 1637920656
- Apptype = "BASIC"
- EXPORT = "export"
- GAP_NPROC = "1"
- GAP_NDCU = "1"
- GAP_EXCLUSIVE = ""
- GAP_PPN = ""
- GAP_NGPU = ""
- GAP_MULTI_SUB = ""
- StrJobInfoMap = "%d,%s:%s:"
- Username = "acgnnmfbwo"
- )
-
- var RESOURCEMAP = map[string]ResourceSpecHpc{
- "FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": {
- GAP_NNODE: "1",
- GAP_NPROC: "1",
- GAP_NDCU: "1",
- },
- "Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": {
- GAP_NNODE: "1",
- GAP_NPROC: "2",
- GAP_NDCU: "1",
- },
- "uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": {
- GAP_NNODE: "1",
- GAP_NPROC: "4",
- GAP_NDCU: "2",
- },
- "D71OZQYrRabJc2nfL2GDWOdLEfbiMzYH": {
- GAP_NNODE: "1",
- GAP_NPROC: "8",
- GAP_NDCU: "4",
- },
- "sXUMrGmgMDFJaLi6dPiB9LkHjFb3lvL5": {
- GAP_NNODE: "1",
- GAP_NPROC: "16",
- GAP_NDCU: "4",
- },
- "ZfCKQKbNbQl9RPwlSyWLah1Gf7Ti7uJA": {
- GAP_NNODE: "1",
- GAP_NPROC: "32",
- GAP_NDCU: "4",
- },
- "cfEI4ulTNo2gYUozzdG59URByUjwLl3x": {
- GAP_NNODE: "2",
- GAP_NPROC: "4",
- GAP_NDCU: "2",
- },
- "vtbkaks8bErhpLRkUDiPDUHq6ssotFpD": {
- GAP_NNODE: "2",
- GAP_NPROC: "8",
- GAP_NDCU: "4",
- },
- "QJXZFJSReVWWQfkvQjGyEq1JpDHN55Oh": {
- GAP_NNODE: "2",
- GAP_NPROC: "16",
- GAP_NDCU: "4",
- },
- "79xSdy48yLbVLl9DqEV6tQ2J6jaHe5KO": {
- GAP_NNODE: "2",
- GAP_NPROC: "32",
- GAP_NDCU: "8",
- },
- }
-
- var RESOURCESPECSHPC = map[string]string{
- "FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": "1*NODE, CPU:1, 1*DCU",
- "Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": "1*NODE, CPU:2, 1*DCU",
- "uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": "1*NODE, CPU:4, 2*DCU",
- "D71OZQYrRabJc2nfL2GDWOdLEfbiMzYH": "1*NODE, CPU:8, 4*DCU",
- "sXUMrGmgMDFJaLi6dPiB9LkHjFb3lvL5": "1*NODE, CPU:16, 4*DCU",
- "ZfCKQKbNbQl9RPwlSyWLah1Gf7Ti7uJA": "1*NODE, CPU:32, 4*DCU",
- "cfEI4ulTNo2gYUozzdG59URByUjwLl3x": "2*NODE, CPU:4, 2*DCU",
- "vtbkaks8bErhpLRkUDiPDUHq6ssotFpD": "2*NODE, CPU:8, 4*DCU",
- "QJXZFJSReVWWQfkvQjGyEq1JpDHN55Oh": "2*NODE, CPU:16, 4*DCU",
- "79xSdy48yLbVLl9DqEV6tQ2J6jaHe5KO": "2*NODE, CPU:32, 8*DCU",
- }
-
- var AcStatus = map[string]string{
- "statQ": "Pending",
- "statR": "Running",
- "statE": "Pending",
- "statC": "Completed",
- "statH": "Pending",
- "statS": "Pending",
- "statW": "Pending",
- "statX": "Other",
- }
-
- type ResourceSpecHpc struct {
- GAP_NNODE string
- GAP_NPROC string
- GAP_NDCU string
- }
-
- func NewShuguangHpc(aCRpc hpcacclient.HpcAC, name string, id int64) *ShuguangHpc {
- return &ShuguangHpc{aCRpc: aCRpc, platform: name, participantId: id}
- }
-
- func (s ShuguangHpc) UploadImage(ctx context.Context, path string) (interface{}, error) {
- return nil, nil
- }
-
- func (s ShuguangHpc) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
- return nil, nil
- }
-
- func (s ShuguangHpc) QueryImageList(ctx context.Context) (interface{}, error) {
- return nil, nil
- }
-
- func (s ShuguangHpc) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
- // shuguangHpc提交任务
-
- //判断是否resourceId匹配自定义资源Id
- _, isMapContainsKey := RESOURCESPECSHPC[resourceId]
- if !isMapContainsKey {
- return nil, errors.New("shuguangHpc资源Id不存在")
- }
-
- //环境变量
- var env string
- for _, e := range envs {
- s := strings.Split(e, COMMA)
- env += EXPORT + SPACE + s[0] + EQUAL + s[1] + NEWLINE
- }
-
- //请求
- taskName := TASK_SHUGUANG_PREFIX + UNDERSCORE + utils.RandomString(10)
- GAP_WORK_DIR := strings.Replace(GAP_WORK_DIR, JOBNAME, taskName, -1)
- GAP_STD_OUT_FILE := strings.Replace(GAP_STD_OUT_FILE, JOBNAME, taskName, -1)
- GAP_STD_ERR_FILE := strings.Replace(GAP_STD_ERR_FILE, JOBNAME, taskName, -1)
-
- req := &hpcAC.SubmitJobReq{
- Apptype: Apptype,
- Appname: GAP_APPNAME,
- StrJobManagerID: StrJobManagerID,
- MapAppJobInfo: &hpcAC.MapAppJobInfo{
- GAP_CMD_FILE: cmd,
- GAP_NNODE: GAP_NNODE,
- GAP_NODE_STRING: GAP_NODE_STRING,
- GAP_SUBMIT_TYPE: GAP_CMD_FILE,
- GAP_JOB_NAME: taskName,
- GAP_WORK_DIR: GAP_WORK_DIR,
- GAP_QUEUE: GAP_QUEUE,
- GAP_NPROC: GAP_NPROC,
- GAP_PPN: GAP_PPN,
- GAP_NGPU: GAP_NGPU,
- GAP_NDCU: GAP_NDCU,
- GAP_WALL_TIME: GAP_WALL_TIME_24H,
- GAP_EXCLUSIVE: GAP_EXCLUSIVE,
- GAP_APPNAME: GAP_APPNAME,
- GAP_MULTI_SUB: GAP_MULTI_SUB,
- GAP_STD_OUT_FILE: GAP_STD_OUT_FILE,
- GAP_STD_ERR_FILE: GAP_STD_ERR_FILE,
- GAP_SCHEDULER_OPT_WEB: env,
- },
- }
-
- updateSGHpcRequestByResourceId(resourceId, req)
-
- resp, err := s.aCRpc.SubmitJob(ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
-
- }
-
- func (s ShuguangHpc) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
- //实时作业
- reqC := &hpcAC.JobDetailReq{
- JobId: taskId,
- }
- respC, err := s.aCRpc.GetJobDetail(ctx, reqC)
- if err != nil {
- return nil, err
- }
-
- //实时作业检查是否成功
- if respC.Data != nil && respC.Data.JobEndTime != "" {
- return respC, nil
- } else {
- //历史作业
- reqH := &hpcAC.HistoryJobDetailReq{
- JobId: taskId,
- JobmanagerId: strconv.Itoa(StrJobManagerID),
- }
-
- respH, err := s.aCRpc.HistoryJobDetail(ctx, reqH)
- if err != nil {
- return nil, err
- }
-
- return respH, nil
- }
- }
-
- func (s ShuguangHpc) QuerySpecs(ctx context.Context) (interface{}, error) {
- resp := &types.GetResourceSpecsResp{}
-
- for k, v := range RESOURCESPECSHPC {
- var respec types.ResourceSpecSl
- respec.SpecId = k
- respec.SpecName = v
- respec.ParticipantId = s.participantId
- respec.ParticipantName = s.platform
- resp.ResourceSpecs = append(resp.ResourceSpecs, &respec)
- }
-
- resp.Success = true
- return resp, nil
- }
-
- func (s ShuguangHpc) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
- strJobInfoMap := fmt.Sprintf(StrJobInfoMap, StrJobManagerID, Username, taskId)
- req := &hpcAC.DeleteJobReq{
- StrJobInfoMap: strJobInfoMap,
- }
- resp, err := s.aCRpc.DeleteJob(ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
- spec := RESOURCEMAP[resourceId]
- req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE
- req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC
- req.MapAppJobInfo.GAP_NDCU = spec.GAP_NDCU
- }
|