|
- package octopusHttp
-
- import (
- "bytes"
- "context"
- "encoding/json"
- "errors"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
- omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
- "gitlink.org.cn/JointCloud/pcm-openi/common"
- "mime/multipart"
- "net/http"
- "strconv"
- "strings"
- )
-
// Request-level literals sent to the Octopus API.
const (
	// RESOURCE_POOL is the resource pool name placed in job-creation and
	// resource-spec requests.
	RESOURCE_POOL = "common-pool"
	// Param_Token is the query-parameter name that carries the access token.
	Param_Token = "token"
	// Param_Addr is the query-parameter name that carries the host address.
	Param_Addr = "addr"
)

// Separator characters.
const (
	Forward_Slash = "/"
	// COMMA separates key and value inside "key,value" entries.
	COMMA      = ","
	UNDERSCORE = "_"
	// SemiColon joins an existing command with an appended one.
	SemiColon = ";"
)

// Literals used when composing training tasks.
const (
	// TASK_NAME_PREFIX starts every generated training-job name.
	TASK_NAME_PREFIX = "trainJob"
	// Python is the interpreter invocation, including its trailing space.
	Python = "python "
)
-
// NotImplementError is the message carried by errors returned from operations
// this adapter does not provide.
const NotImplementError = "not implemented"
-
// Octopus REST endpoint paths. They carry no leading slash and are appended
// directly to the configured server prefix.
const (
	// MyAlgorithmListUrl is the algorithm-listing endpoint path.
	MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
	// ResourcespecsUrl is the resource-specification endpoint path.
	ResourcespecsUrl = "api/v1/resource/specs"
	// CreateTrainJobUrl is the training-job creation endpoint path.
	CreateTrainJobUrl = "api/v1/job/create"
	// TrainJobDetail is the training-job detail endpoint path.
	TrainJobDetail = "api/v1/job/detail"
)
-
- // compute source
// ComputeSourceToCardType maps a compute-source identifier (a card model name
// or a device resource key) to the accelerator card type it belongs to.
// Lookups are case-sensitive; several models are listed in more than one
// casing.
var ComputeSourceToCardType = map[string]string{
	// Nvidia cards.
	"nvidia-a100":     "GPU",
	"nvidia-a100-80g": "GPU",

	// Iluvatar cards.
	"mr-v100":        "ILUVATAR-GPGPU",
	"bi-v100":        "ILUVATAR-GPGPU",
	"MR-V50":         "ILUVATAR-GPGPU",
	"BI-V100":        "ILUVATAR-GPGPU",
	"BI-V150":        "ILUVATAR-GPGPU",
	"MR-V100":        "ILUVATAR-GPGPU",
	"ILUVATAR-GPGPU": "ILUVATAR-GPGPU",

	// Other vendors.
	"cambricon.com/mlu":    "MLU",
	"hygon.com/dcu":        "DCU",
	"huawei.com/Ascend910": "NPU",
	"enflame.com/gcu":      "GCU",
	"MXN260":               "METAX-GPGPU",
}
-
- type OctopusHttp struct {
- server string
- host string
- platform string
- participantId int64
- token *Token
- }
-
- func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp {
- token, _ := NewToken(host, user, pwd)
- return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}
- }
-
- // executor
- func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
- switch mode {
- case executor.SUBMIT_MODE_JOINT_CLOUD:
-
- case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
- // cmd
- if option.AlgorithmId != "" {
- option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
- }
- option.ResourceId = "9e2feeae30e04492a4298755179f2ae0"
- task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
- if err != nil {
- return nil, err
- }
- return task, nil
- }
- return nil, nil
- }
-
- func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
- //TODO implement me
- panic("implement me")
- }
-
- func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
-
- // octopus提交任务
- reqUrl := o.server + CreateTrainJobUrl
-
- token, err := o.token.Get()
- if err != nil {
- return nil, err
- }
-
- // python参数
- var prms []struct {
- Key string `json:"key"`
- Value string `json:"value"`
- }
- for _, param := range params {
- var p struct {
- Key string `json:"key"`
- Value string `json:"value"`
- }
- s := strings.Split(param, COMMA)
- p.Key = s[0]
- p.Value = s[1]
- prms = append(prms, p)
- }
-
- //环境变量
- envMap := make(map[string]string)
- for _, env := range envs {
- s := strings.Split(env, COMMA)
- envMap[s[0]] = s[1]
- }
-
- param := &omodel.CreateTrainJobParam{
- //DataSetId: datasetsId,
- //DataSetVersion: VERSION,
- //AlgorithmId: algorithmId,
- //AlgorithmVersion: VERSION,
- Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
- ImageId: imageId,
- IsDistributed: false,
- ResourcePool: RESOURCE_POOL,
- Config: []*omodel.CreateTrainJobConf{
- {
- Command: cmd,
- ResourceSpecId: resourceId,
- MinFailedTaskCount: 1,
- MinSucceededTaskCount: 1,
- TaskNumber: 1,
- Parameters: prms,
- Envs: envMap,
- },
- },
- }
-
- resp := &entity.OctCreateJobResp{}
-
- req := common.GetRestyRequest(common.TIMEOUT)
- _, err = req.
- SetHeader("Authorization", "Bearer "+token).
- SetBody(param).
- SetResult(resp).
- Post(reqUrl)
-
- if err != nil {
- return nil, err
- }
- return resp, nil
-
- }
-
- // collector
- func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResourceSpecsResp, error) {
- resourcespecsUrl := o.server + ResourcespecsUrl
- token, err := o.token.Get()
- if err != nil {
- return nil, err
- }
-
- param := omodel.ResourceSpecParam{
- ResourcePool: RESOURCE_POOL,
- }
-
- b, _ := json.Marshal(param)
- byt := bytes.NewBuffer(b)
-
- resp := &entity.OctResourceSpecsResp{}
-
- req := common.GetRestyRequest(common.TIMEOUT)
- r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
- req.RawRequest = r
- req.URL = resourcespecsUrl
-
- _, err = req.
- SetHeader("Content-Type", "application/json").
- SetQueryParam(Param_Token, token).
- SetQueryParam(Param_Addr, o.host).
- SetBody(byt).
- SetResult(resp).
- Send()
-
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
- resp, err := o.resourceSpecs(ctx)
- if err != nil {
- return nil, err
- }
- if resp.Code != http.StatusOK {
- if resp.Data != nil {
- marshal, err := json.Marshal(resp.Data)
- if err != nil {
- return nil, err
- }
-
- errormdl := &omodel.Error{}
- err = json.Unmarshal(marshal, errormdl)
- if err != nil {
- return nil, err
- }
- return nil, errors.New(errormdl.Message)
- }
- } else {
- if resp.Data != nil {
- spec := &entity.OctResourceSpecs{}
- marshal, err := json.Marshal(resp.Data)
- if err != nil {
- return nil, err
- }
- err = json.Unmarshal(marshal, spec)
- if err != nil {
- return nil, err
- }
- }
- }
-
- return nil, nil
- }
-
- func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
- return nil, nil
- }
-
- func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
- //TODO implement me
- panic("implement me")
- }
-
- func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
- //TODO implement me
- panic("implement me")
- }
-
- func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
- //TODO implement me
- panic("implement me")
- }
-
- func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
- //TODO implement me
- panic("implement me")
- }
-
- func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
- //TODO implement me
- panic("implement me")
- }
-
- func (o OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
- //TODO implement me
- panic("implement me")
- }
-
- func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
- //TODO implement me
- panic("implement me")
- }
-
- func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
- resp, err := o.resourceSpecs(ctx)
- if err != nil {
- return nil, err
- }
-
- res := &collector.ResourceSpec{
- ClusterId: strconv.FormatInt(o.participantId, 10),
- Tag: resrcType,
- }
-
- if resp.Code != http.StatusOK {
- if resp.Data != nil {
- marshal, err := json.Marshal(resp.Data)
- if err != nil {
- return nil, err
- }
-
- errormdl := &omodel.Error{}
- err = json.Unmarshal(marshal, errormdl)
- if err != nil {
- return nil, err
- }
- return nil, errors.New(errormdl.Message)
- }
- } else {
- if resp.Data != nil {
- specs := &entity.OctResourceSpecs{}
- marshal, err := json.Marshal(resp.Data)
- if err != nil {
- return nil, err
- }
- err = json.Unmarshal(marshal, specs)
- if err != nil {
- return nil, err
- }
- clusterResources, err := genSpecs(specs, resrcType)
- if err != nil {
- return nil, err
- }
- res.Resources = clusterResources
- }
- }
-
- return res, nil
- }
-
- func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) {
- res := make([]interface{}, 0)
- if resrcType == "Inference" {
- return res, nil
- } else if resrcType == "Train" {
- if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
- return res, nil
- } else {
- for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
- spec := &omodel.Spec{}
- marshal, err := json.Marshal(s)
- if err != nil {
- return nil, err
- }
- err = json.Unmarshal(marshal, specs)
- if err != nil {
- return nil, err
- }
- if spec.ResourceQuantity.BiV100 != "" {
-
- }
- //cres := &collector.ClusterResource{}
- //card := &collector.Usage{
- // Type: ComputeSource[i],
- // Name: strings.ToUpper(k),
- // Total: &collector.UnitValue{Unit: spec.ResourceQuantity, Value: v.AccCardsNum},
- // Available: &collector.UnitValue{Unit: NUMBER, Value: v.AccCardsNum},
- //}
- //spec.ResourceQuantity.
- }
- }
- }
-
- return nil, nil
- }
-
- // inference
- func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
- return nil, errors.New(NotImplementError)
- }
-
- func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
- return nil, errors.New(NotImplementError)
- }
-
- func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
- return false
- }
-
- func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
- return false
- }
-
- func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
- return nil, errors.New(NotImplementError)
- }
-
- func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
- return "", errors.New(NotImplementError)
- }
-
- func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
- return false
- }
-
- func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
- return "", errors.New(NotImplementError)
- }
|