|
|
|
@@ -5,6 +5,8 @@ import ( |
|
|
|
"context" |
|
|
|
"encoding/json" |
|
|
|
"errors" |
|
|
|
"fmt" |
|
|
|
common2 "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" |
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector" |
|
|
|
@@ -15,6 +17,7 @@ import ( |
|
|
|
"gitlink.org.cn/JointCloud/pcm-openi/common" |
|
|
|
"mime/multipart" |
|
|
|
"net/http" |
|
|
|
"strconv" |
|
|
|
"strings" |
|
|
|
) |
|
|
|
|
|
|
|
@@ -28,6 +31,24 @@ const ( |
|
|
|
TASK_NAME_PREFIX = "trainJob" |
|
|
|
Python = "python " |
|
|
|
SemiColon = ";" |
|
|
|
BALANCE = "balance" |
|
|
|
RATE = "rate" |
|
|
|
PERHOUR = "per-hour" |
|
|
|
NUMBER = "number" |
|
|
|
KILOBYTE = "kb" |
|
|
|
GIGABYTE = "gb" |
|
|
|
CPUCORE = "core" |
|
|
|
STORAGE = "STORAGE" |
|
|
|
DISK = "disk" |
|
|
|
MEMORY = "memory" |
|
|
|
RAM = "ram" |
|
|
|
VRAM = "vram" |
|
|
|
RMB = "rmb" |
|
|
|
POINT = "point" |
|
|
|
RUNNINGTASK = "RUNNING_TASK" |
|
|
|
RUNNING = "RUNNING" |
|
|
|
CPU = "cpu" |
|
|
|
Gi = "Gi" |
|
|
|
) |
|
|
|
|
|
|
|
const ( |
|
|
|
@@ -71,9 +92,12 @@ type OctopusHttp struct { |
|
|
|
token *Token |
|
|
|
} |
|
|
|
|
|
|
|
func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp { |
|
|
|
token, _ := NewToken(host, user, pwd) |
|
|
|
return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token} |
|
|
|
func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) (*OctopusHttp, error) { |
|
|
|
token, err := NewToken(server, host, user, pwd) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}, nil |
|
|
|
} |
|
|
|
|
|
|
|
// executor |
|
|
|
@@ -291,50 +315,50 @@ func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) { |
|
|
|
} |
|
|
|
|
|
|
|
func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) { |
|
|
|
//resp, err := o.resourceSpecs(ctx) |
|
|
|
//if err != nil { |
|
|
|
// return nil, err |
|
|
|
//} |
|
|
|
// |
|
|
|
//res := &collector.ResourceSpec{ |
|
|
|
// ClusterId: strconv.FormatInt(o.participantId, 10), |
|
|
|
// Tag: resrcType, |
|
|
|
//} |
|
|
|
// |
|
|
|
//if resp.Code != http.StatusOK { |
|
|
|
// if resp.Data != nil { |
|
|
|
// marshal, err := json.Marshal(resp.Data) |
|
|
|
// if err != nil { |
|
|
|
// return nil, err |
|
|
|
// } |
|
|
|
// |
|
|
|
// errormdl := &omodel.Error{} |
|
|
|
// err = json.Unmarshal(marshal, errormdl) |
|
|
|
// if err != nil { |
|
|
|
// return nil, err |
|
|
|
// } |
|
|
|
// return nil, errors.New(errormdl.Message) |
|
|
|
// } |
|
|
|
//} else { |
|
|
|
// if resp.Data != nil { |
|
|
|
// specs := &entity.OctResourceSpecs{} |
|
|
|
// marshal, err := json.Marshal(resp.Data) |
|
|
|
// if err != nil { |
|
|
|
// return nil, err |
|
|
|
// } |
|
|
|
// err = json.Unmarshal(marshal, specs) |
|
|
|
// if err != nil { |
|
|
|
// return nil, err |
|
|
|
// } |
|
|
|
// clusterResources, err := genSpecs(specs, resrcType) |
|
|
|
// if err != nil { |
|
|
|
// return nil, err |
|
|
|
// } |
|
|
|
// res.Resources = clusterResources |
|
|
|
// } |
|
|
|
//} |
|
|
|
resp, err := o.resourceSpecs(ctx) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
|
|
|
|
return nil, nil |
|
|
|
res := &collector.ResourceSpec{ |
|
|
|
ClusterId: strconv.FormatInt(o.participantId, 10), |
|
|
|
Tag: resrcType, |
|
|
|
} |
|
|
|
|
|
|
|
if resp.Code != http.StatusOK { |
|
|
|
if resp.Data != nil { |
|
|
|
marshal, err := json.Marshal(resp.Data) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
|
|
|
|
errormdl := &omodel.Error{} |
|
|
|
err = json.Unmarshal(marshal, errormdl) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return nil, errors.New(errormdl.Message) |
|
|
|
} |
|
|
|
} else { |
|
|
|
if resp.Data != nil { |
|
|
|
specs := &entity.OctResourceSpecs{} |
|
|
|
marshal, err := json.Marshal(resp.Data) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
err = json.Unmarshal(marshal, specs) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
clusterResources, err := genSpecs(specs, resrcType) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
res.Resources = clusterResources |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return res, nil |
|
|
|
} |
|
|
|
|
|
|
|
func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) { |
|
|
|
@@ -355,24 +379,207 @@ func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
if spec.ResourceQuantity.BiV100 != "" { |
|
|
|
|
|
|
|
resType, err := chooseResourceType(spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
//cres := &collector.ClusterResource{} |
|
|
|
//card := &collector.Usage{ |
|
|
|
// Type: ComputeSource[i], |
|
|
|
// Name: strings.ToUpper(k), |
|
|
|
// Total: &collector.UnitValue{Unit: spec.ResourceQuantity, Value: v.AccCardsNum}, |
|
|
|
// Available: &collector.UnitValue{Unit: NUMBER, Value: v.AccCardsNum}, |
|
|
|
//} |
|
|
|
//spec.ResourceQuantity. |
|
|
|
if resType == nil { |
|
|
|
continue |
|
|
|
} |
|
|
|
res = append(res, resType) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return res, nil |
|
|
|
} |
|
|
|
|
|
|
|
func chooseResourceType(spec *omodel.Spec) (*collector.ClusterResource, error) { |
|
|
|
if spec.ResourceQuantity.NvidiaA100 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.NvidiaA10080G != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.MrV100 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.BiV100 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.MRV50 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.BIV100 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.BIV150 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.MRV100 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.CambriconComMlu != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.HygonComDcu != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.HuaweiComAscend910 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.EnflameComGcu != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} else if spec.ResourceQuantity.MXN260 != "" { |
|
|
|
tag, err := common2.GetJSONTag(spec, "NvidiaA100") |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
return cres, nil |
|
|
|
} |
|
|
|
|
|
|
|
return nil, nil |
|
|
|
} |
|
|
|
|
|
|
|
func genClusterResources(cType string, cNum string, s *omodel.Spec) (*collector.ClusterResource, error) { |
|
|
|
cres := &collector.ClusterResource{} |
|
|
|
bres := make([]*collector.Usage, 0) |
|
|
|
|
|
|
|
cardNum, err := strconv.ParseInt(cNum, 10, 64) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
cpuCore, err := strconv.ParseInt(s.ResourceQuantity.Cpu, 10, 64) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
gi := strings.Split(s.ResourceQuantity.Memory, Gi) |
|
|
|
if len(gi) != 1 { |
|
|
|
return nil, fmt.Errorf("s.ResourceQuantity.Memory convert error: %s", s.ResourceQuantity.Memory) |
|
|
|
} |
|
|
|
|
|
|
|
memGi, err := strconv.ParseInt(gi[0], 10, 64) |
|
|
|
if err != nil { |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
|
|
|
|
card := &collector.Usage{ |
|
|
|
Type: ComputeSourceToCardType[cType], |
|
|
|
Name: strings.ToUpper(cType), |
|
|
|
Total: &collector.UnitValue{Unit: NUMBER, Value: cardNum}, |
|
|
|
Available: &collector.UnitValue{Unit: NUMBER, Value: cardNum}, |
|
|
|
} |
|
|
|
cpu := &collector.Usage{ |
|
|
|
Type: strings.ToUpper(CPU), |
|
|
|
Name: strings.ToUpper(CPU), |
|
|
|
Total: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore}, |
|
|
|
Available: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore}, |
|
|
|
} |
|
|
|
mem := &collector.Usage{ |
|
|
|
Type: strings.ToUpper(MEMORY), |
|
|
|
Name: strings.ToUpper(RAM), |
|
|
|
Total: &collector.UnitValue{Unit: GIGABYTE, Value: memGi}, |
|
|
|
Available: &collector.UnitValue{Unit: GIGABYTE, Value: memGi}, |
|
|
|
} |
|
|
|
|
|
|
|
bres = append(bres, cpu) |
|
|
|
bres = append(bres, mem) |
|
|
|
|
|
|
|
cres.Resource = card |
|
|
|
cres.BaseResources = bres |
|
|
|
|
|
|
|
return cres, nil |
|
|
|
} |
|
|
|
|
|
|
|
// inference |
|
|
|
func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) { |
|
|
|
return nil, errors.New(NotImplementError) |
|
|
|
|