/* Copyright (c) [2023] [pcm] [pcm-coordinator] is licensed under Mulan PSL v2. You can use this software according to the terms and conditions of the Mulan PSL v2. You may obtain a copy of Mulan PSL v2 at: http://license.coscl.org.cn/MulanPSL2 THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPaRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. See the Mulan PSL v2 for more details. */ package storeLink import ( "bufio" "context" "errors" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-octopus/octopus" "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" "io" "math" "mime/multipart" "strconv" "strings" "time" ) type OctopusLink struct { octopusRpc octopusclient.Octopus pageIndex int32 pageSize int32 platform string participantId int64 } const ( IMG_NAME_PREFIX = "oct_" IMG_VERSION_PREFIX = "version_" TASK_NAME_PREFIX = "trainJob" RESOURCE_POOL = "common-pool" HANWUJI = "hanwuji" SUIYUAN = "suiyuan" SAILINGSI = "sailingsi" MLU = "MLU" BIV100 = "BI-V100" CAMBRICONMLU290 = 256 GCU = "GCU" ENFLAME = "enflame" EnflameT20 = 128 BASE_TOPS = 128 CAMBRICON = "cambricon" ILUVATAR = "iluvatar" TRAIN_CMD = "cd /code; python train.py" VERSION = "V1" DOMAIN = "http://192.168.242.41:8001/" CAMBRICON_CN = "寒武纪290" ENFLAME_CN = "燧原T20" ILUVATAR_CN = "天数BI-V100" ) var ( cardAliasMap = map[string]string{ MLU: CAMBRICON, GCU: ENFLAME, BIV100: ILUVATAR, } cardCnMap = map[string]string{ MLU: CAMBRICON_CN, GCU: ENFLAME_CN, BIV100: ILUVATAR_CN, } cardTopsMap = map[string]float64{ MLU: CAMBRICONMLU290, GCU: EnflameT20, } CardModelNameCmdMap = map[string]map[string]string{ BIV100: {"blip-image-captioning-base": "pip install -U transformers; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code; python infer_biv100.py", "imagenet_resnet50": "pip install -U transformers; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code/infer; python infer_biv100.py", "ChatGLM-6B": "su root; pip install transformers==4.33.2; pip install fastapi uvicorn[standard]; cd /code; python infer_biv100.py"}, MLU: {"blip-image-captioning-base": "", "imagenet_resnet50": "su root; . /torch/venv3/pytorch/bin/activate; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code/infer; python infer_mlu.py"}, } ) func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink { return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100} } func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) { // octopus创建镜像 createReq := &octopus.CreateImageReq{ Platform: o.platform, CreateImage: &octopus.CreateImage{ SourceType: 1, ImageName: IMG_NAME_PREFIX + utils.RandomString(7), ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7), }, } createResp, err := o.octopusRpc.CreateImage(ctx, createReq) if err != nil { return nil, err } // octopus上传镜像 uploadReq := &octopus.UploadImageReq{ Platform: o.platform, ImageId: createResp.Payload.ImageId, Params: &octopus.UploadImageParam{ Domain: "", FileName: "", }, } uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq) if err != nil { return nil, err } // Todo 实际上传 return uploadResp, nil } func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) { // octopus删除镜像 req := &octopus.DeleteImageReq{ Platform: o.platform, ImageId: imageId, } resp, err := o.octopusRpc.DeleteImage(ctx, req) if err != nil { return nil, err } return resp, nil } func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) { // octopus获取镜像列表 req := &octopus.GetUserImageListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } resp, err := o.octopusRpc.GetUserImageList(ctx, req) if err != nil { return nil, err } return resp, nil } func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) { // octopus提交任务 // python参数 var prms []*octopus.Parameters for _, param := range params { var p octopus.Parameters s := strings.Split(param, COMMA) p.Key = s[0] p.Value = s[1] prms = append(prms, &p) } //环境变量 envMap := make(map[string]string) for _, env := range envs { s := strings.Split(env, COMMA) envMap[s[0]] = s[1] } req := &octopus.CreateTrainJobReq{ Platform: o.platform, Params: &octopus.CreateTrainJobParam{ ImageId: imageId, Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10), ResourcePool: RESOURCE_POOL, Config: []*octopus.Config{ { Command: cmd, ResourceSpecId: resourceId, MinFailedTaskCount: 1, MinSucceededTaskCount: 1, TaskNumber: 1, Parameters: prms, Envs: envMap, }, }, DataSetId: datasetsId, DataSetVersion: VERSION, AlgorithmId: algorithmId, AlgorithmVersion: VERSION, }, } resp, err := o.octopusRpc.CreateTrainJob(ctx, req) if err != nil { return nil, err } return resp, nil } func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) { // octopus获取任务 req := &octopus.GetTrainJobReq{ Platform: o.platform, Id: taskId, } resp, err := o.octopusRpc.GetTrainJob(ctx, req) if err != nil { return nil, err } return resp, nil } func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) { // octopus删除任务 req := &octopus.DeleteTrainJobReq{ Platform: o.platform, JobIds: []string{taskId}, } resp, err := o.octopusRpc.DeleteTrainJob(ctx, req) if err != nil { return nil, err } return resp, nil } func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) { // octopus查询资源规格 req := &octopus.GetResourceSpecsReq{ Platform: o.platform, ResourcePool: RESOURCE_POOL, } resp, err := o.octopusRpc.GetResourceSpecs(ctx, req) if err != nil { return nil, err } return resp, nil } func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) { req := &octopus.GetResourceSpecsReq{ Platform: o.platform, ResourcePool: RESOURCE_POOL, } specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req) if err != nil { return nil, err } if !specResp.Success { return nil, errors.New(specResp.Error.Message) } balanceReq := &octopus.GetUserBalanceReq{ Platform: o.platform, } balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq) if err != nil { return nil, err } if !balanceResp.Success { return nil, errors.New(balanceResp.Error.Message) } var cards []*collector.Card balance := float64(balanceResp.Payload.BillingUser.Amount) var cpuHours float64 for _, spec := range specResp.TrainResourceSpecs { if spec.Price == 0 { ns := strings.Split(spec.Name, COMMA) if len(ns) == 2 { nss := strings.Split(ns[0], COLON) if nss[0] == CPU { cpuHours = -1 } } } if spec.Price == 1 { ns := strings.Split(spec.Name, COMMA) cardSpecs := strings.Split(ns[0], STAR) cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]] if !isMapContainsKey { continue } card := &collector.Card{ Platform: OCTOPUS, Type: CARD, Name: cardSpecs[1], TOpsAtFp16: cardTops, CardHours: balance / spec.Price, } cards = append(cards, card) } } resourceStats := &collector.ResourceStats{ ClusterId: strconv.FormatInt(o.participantId, 10), Name: o.platform, Balance: balance, CardsAvail: cards, CpuCoreHours: cpuHours, } return resourceStats, nil } func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) { req := &octopus.GetMyDatasetListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } resp, err := o.octopusRpc.GetMyDatasetList(ctx, req) if err != nil { return nil, err } if !resp.Success { return nil, errors.New(resp.Error.Message) } specs := []*collector.DatasetsSpecs{} for _, dataset := range resp.Payload.Datasets { spec := &collector.DatasetsSpecs{Name: dataset.Name} specs = append(specs, spec) } return specs, nil } func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) { var algorithms []*collector.Algorithm req := &octopus.GetMyAlgorithmListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req) if err != nil { return nil, err } if !resp.Success { return nil, errors.New("failed to get algorithms") } for _, a := range resp.Payload.Algorithms { algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)} algorithms = append(algorithms, algorithm) } return algorithms, nil } func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) { var cards []string for s, _ := range cardAliasMap { cards = append(cards, s) } return cards, nil } func (o *OctopusLink) GetUserBalance(ctx context.Context) (float64, error) { balanceReq := &octopus.GetUserBalanceReq{ Platform: o.platform, } balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq) if err != nil { return 0, err } if !balanceResp.Success { if balanceResp.Error != nil { return 0, errors.New(balanceResp.Error.Message) } else { return 0, errors.New("failed to get user balance") } } balance := float64(balanceResp.Payload.BillingUser.Amount) return balance, nil } func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { var name string if resourceType == CARD { name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card } else { name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU } req := &octopus.GetMyAlgorithmListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req) if err != nil { return "", err } if !resp.Success { return "", errors.New("failed to get algorithmList") } var algorithmId string var algorithms []*octopus.Algorithms for _, a := range resp.Payload.Algorithms { if strings.ToLower(a.FrameworkName) != taskType { continue } if a.AlgorithmDescript == name { algorithms = append(algorithms, a) } } if len(algorithms) == 0 { return "", errors.New("algorithmId not found") } if len(algorithms) == 1 { algorithmId = algorithms[0].AlgorithmId } aLatest := &octopus.Algorithms{} for i, _ := range algorithms { if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) { aLatest = algorithms[i] } } if aLatest.AlgorithmId == "" { return "", errors.New("algorithmId not found") } algorithmId = aLatest.AlgorithmId dcReq := &octopus.DownloadCompressReq{ Platform: o.platform, Version: VERSION, AlgorithmId: algorithmId, } dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq) if err != nil { return "", err } if !dcResp.Success { return "", errors.New(dcResp.Error.Message) } daReq := &octopus.DownloadAlgorithmReq{ Platform: o.platform, Version: VERSION, AlgorithmId: algorithmId, CompressAt: dcResp.Payload.CompressAt, Domain: DOMAIN, } daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq) if err != nil { return "", err } if !daResp.Success { return "", errors.New(dcResp.Error.Message) } urlReq := &octopus.AlgorithmUrlReq{ Platform: o.platform, Url: daResp.Payload.DownloadUrl, } urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq) if err != nil { return "", err } return urlResp.Algorithm, nil } func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error { //var name string //if resourceType == CARD { // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card //} else { // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU //} //uploadReq := &octopus.UploadAlgorithmReq{} return nil } func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) { instance, err := strconv.ParseInt(instanceNum, 10, 32) if err != nil { return "", err } req := &octopus.GetTrainJobLogReq{ Platform: o.platform, TaskId: taskId, TaskNum: "task0", Num: int32(instance), } resp, err := o.octopusRpc.GetTrainJobLog(ctx, req) if err != nil { return "", err } if strings.Contains(resp.Content, "404 Not Found") { resp.Content = "waiting for logs..." } return resp.Content, nil } func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) { resp, err := o.QueryTask(ctx, taskId) if err != nil { return nil, err } jobresp, ok := (resp).(*octopus.GetTrainJobResp) if !jobresp.Success || !ok { if jobresp.Error != nil { return nil, errors.New(jobresp.Error.Message) } else { return nil, errors.New("get training task failed, empty error returned") } } var task collector.Task task.Id = jobresp.Payload.TrainJob.Id if jobresp.Payload.TrainJob.StartedAt != 0 { task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout) } if jobresp.Payload.TrainJob.CompletedAt != 0 { task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout) } switch jobresp.Payload.TrainJob.Status { case "succeeded": task.Status = constants.Completed case "failed": task.Status = constants.Failed case "running": task.Status = constants.Running case "stopped": task.Status = constants.Stopped case "pending": task.Status = constants.Pending default: task.Status = "undefined" } return &task, nil } func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { err := o.GenerateSubmitParams(ctx, option) if err != nil { return nil, err } task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType) if err != nil { return nil, err } return task, nil } func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error { err := o.generateResourceId(ctx, option, nil) if err != nil { return err } err = o.generateDatasetsId(ctx, option) if err != nil { return err } err = o.generateImageId(ctx, option, nil) if err != nil { return err } err = o.generateAlgorithmId(ctx, option, nil) if err != nil { return err } err = o.generateCmd(option, nil) if err != nil { return err } err = o.generateEnv(option) if err != nil { return err } err = o.generateParams(option) if err != nil { return err } return nil } func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error { req := &octopus.GetResourceSpecsReq{ Platform: o.platform, ResourcePool: RESOURCE_POOL, } specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req) if err != nil { return err } if !specResp.Success { return errors.New(specResp.Error.Message) } if option != nil { err = generateResourceIdForTraining(option, specResp) if err != nil { return err } return nil } if ifoption != nil { err = generateResourceIdForInferDeployInstance(ifoption, specResp) if err != nil { return err } return nil } return errors.New("failed to set ResourceId") } func generateResourceIdForTraining(option *option.AiOption, specResp *octopus.GetResourceSpecsResp) error { if option.ResourceType == "" { return errors.New("ResourceType not set") } if option.ResourceType == CPU { for _, spec := range specResp.TrainResourceSpecs { if spec.Price == 0 { option.ResourceId = spec.Id return nil } } } if option.ResourceType == CARD { if option.ComputeCard == "" { option.ComputeCard = GCU } err := setResourceIdByCard(option, specResp, option.ComputeCard) if err != nil { return err } return nil } return errors.New("ResourceType not set") } func generateResourceIdForInferDeployInstance(option *option.InferOption, specResp *octopus.GetResourceSpecsResp) error { // temporarily use bi-v100 cardName, ok := cardCnMap[BIV100] if !ok { errors.New("computeCard not set") } // set computeCard option.ComputeCard = BIV100 for _, spec := range specResp.TrainResourceSpecs { names := strings.Split(spec.Name, COMMA) if len(names) != 4 { continue } ns := strings.Split(names[0], STAR) if len(ns) != 2 { continue } if ns[0] == "1" && ns[1] == cardName { option.ResourceId = spec.Id return nil } } return errors.New("failed to set ResourceId") } func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error { if option.DatasetsName == "" { return errors.New("DatasetsName not set") } req := &octopus.GetMyDatasetListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } resp, err := o.octopusRpc.GetMyDatasetList(ctx, req) if err != nil { return err } if !resp.Success { return errors.New("failed to get DatasetsId") } for _, dataset := range resp.Payload.Datasets { if dataset.Name == option.DatasetsName { option.DatasetsId = dataset.Id return nil } } return errors.New("failed to get DatasetsId") } func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error { preImgReq := &octopus.GetPresetImageListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq) if err != nil { return err } if !preImgResp.Success { return errors.New("failed to get PresetImages") } if option != nil { if option.TaskType == "" { return errors.New("TaskType not set") } req := &octopus.GetUserImageListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } resp, err := o.octopusRpc.GetUserImageList(ctx, req) if err != nil { return err } if !resp.Success { return errors.New("failed to get imageId") } if option.ResourceType == CPU { for _, img := range resp.Payload.Images { if img.Image.ImageName == "test-image" { option.ImageId = img.Image.Id return nil } } } err = generateImageIdForTraining(option, preImgResp) if err != nil { return err } return nil } if ifoption != nil { err = generateImageIdForInferDeployInstance(ifoption, preImgResp) if err != nil { return err } return nil } return errors.New("failed to get ImageId") } func generateImageIdForTraining(option *option.AiOption, preImgResp *octopus.GetPresetImageListResp) error { if option.ResourceType == CARD { for _, image := range preImgResp.Payload.Images { if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) { switch strings.ToUpper(option.ComputeCard) { case GCU: if strings.HasPrefix(image.ImageVersion, "t20_") { option.ImageId = image.Id return nil } case BIV100: if strings.HasPrefix(image.ImageVersion, "bi_") { option.ImageId = image.Id return nil } case MLU: option.ImageId = image.Id return nil } } } } return errors.New("failed to set ImageId") } func generateImageIdForInferDeployInstance(option *option.InferOption, preImgResp *octopus.GetPresetImageListResp) error { for _, image := range preImgResp.Payload.Images { // temporarily use bi-v100 if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(BIV100)]) { switch strings.ToUpper(BIV100) { case GCU: if strings.HasPrefix(image.ImageVersion, "t20_") { option.ImageId = image.Id return nil } case BIV100: if strings.HasPrefix(image.ImageVersion, "bi_") { option.ImageId = image.Id return nil } case MLU: option.ImageId = image.Id return nil } } } return errors.New("failed to set ImageId") } func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error { req := &octopus.GetMyAlgorithmListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req) if err != nil { return err } if !resp.Success { return errors.New("failed to get algorithmId") } if option != nil { err = generateAlgorithmIdForTraining(option, resp) if err != nil { return err } return nil } if ifoption != nil { err = generateAlgorithmIdForInferDeployInstance(ifoption, resp) if err != nil { return err } return nil } return errors.New("failed to set AlgorithmId") } func generateAlgorithmIdForTraining(option *option.AiOption, resp *octopus.GetMyAlgorithmListResp) error { for _, algorithm := range resp.Payload.Algorithms { if algorithm.FrameworkName == strings.Title(option.TaskType) { ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE) if ns[0] != option.DatasetsName { continue } if ns[1] != option.AlgorithmName { continue } switch option.ResourceType { case CPU: if ns[2] != CPU { continue } case CARD: if ns[2] != strings.ToLower(option.ComputeCard) { continue } } option.AlgorithmId = algorithm.AlgorithmId return nil } } return errors.New("Algorithm does not exist") } func generateAlgorithmIdForInferDeployInstance(option *option.InferOption, resp *octopus.GetMyAlgorithmListResp) error { if option.ModelType == "" { return errors.New("ModelType not set") } if option.ModelName == "" { return errors.New("ModelName not set") } for _, algorithm := range resp.Payload.Algorithms { if strings.Contains(algorithm.AlgorithmName, option.ModelName) { option.AlgorithmId = algorithm.AlgorithmId return nil } } return errors.New("ModelName does not exist") } func (o *OctopusLink) generateCmd(option *option.AiOption, ifoption *option.InferOption) error { if option != nil { err := generateCmdForTraining(option) if err != nil { return err } return nil } if ifoption != nil { err := generateCmdForInferDeployInstance(ifoption) if err != nil { return err } return nil } return errors.New("failed to set cmd") } func generateCmdForTraining(option *option.AiOption) error { if option.Cmd == "" { switch option.ComputeCard { case GCU: option.Cmd = "cd /code; python3 train.py" case MLU: option.Cmd = ". /torch/venv3/pytorch/bin/activate; cd /code; python train.py" default: option.Cmd = TRAIN_CMD } } return nil } func generateCmdForInferDeployInstance(option *option.InferOption) error { if option.Cmd == "" { nameCmd, ok := CardModelNameCmdMap[option.ComputeCard] if !ok { return errors.New("failed to set cmd, ComputeCard not exist") } cmd, ok := nameCmd[option.ModelName] if !ok { return errors.New("failed to set cmd, ModelName not exist") } option.Cmd = cmd return nil } return nil } func (o *OctopusLink) generateEnv(option *option.AiOption) error { return nil } func (o *OctopusLink) generateParams(option *option.AiOption) error { if len(option.Params) == 0 { epoch := "epoch" + COMMA + "1" option.Params = append(option.Params, epoch) } return nil } func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error { if option.Tops == 0 { for _, spec := range specs.TrainResourceSpecs { if spec.Price == 1 { ns := strings.Split(spec.Name, COMMA) cardSpecs := strings.Split(ns[0], STAR) if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] { option.ResourceId = spec.Id option.ComputeCard = computeCard return nil } } else { continue } } } else { cardNum := math.Ceil(option.Tops / float64(BASE_TOPS)) for _, spec := range specs.TrainResourceSpecs { if option.Tops < BASE_TOPS { if spec.Price == 1 { ns := strings.Split(spec.Name, COMMA) cardSpecs := strings.Split(ns[0], STAR) if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] { option.ResourceId = spec.Id option.ComputeCard = computeCard return nil } } else { continue } } else { ns := strings.Split(spec.Name, COMMA) if len(ns) != 4 { continue } cardSpecs := strings.Split(ns[0], STAR) if cardSpecs[1] != cardCnMap[strings.ToUpper(computeCard)] { continue } s, err := strconv.ParseFloat(cardSpecs[0], 64) if err != nil { return err } switch computeCard { case GCU: option.ComputeCard = computeCard if cardNum == s { // 1, 4, 8 option.ResourceId = spec.Id return nil } if 1 < cardNum && cardNum <= 4 && s == 4 { option.ResourceId = spec.Id return nil } if 4 < cardNum && s == 8 { option.ResourceId = spec.Id return nil } case MLU: // 1, 2, 4 option.ComputeCard = computeCard if cardNum/2 == s { option.ResourceId = spec.Id return nil } if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 { option.ResourceId = spec.Id return nil } if 2 < cardNum/2 && s == 4 { option.ResourceId = spec.Id return nil } } } } } return errors.New("set ResourceId error") } func (o *OctopusLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) { req := &octopus.GetNotebookListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, SearchKey: DEPLOY_INSTANCE_PREFIEX, } list, err := o.octopusRpc.GetNotebookList(ctx, req) if err != nil { return nil, err } var imageUrls []*inference.InferUrl for _, notebook := range list.Payload.GetNotebooks() { if strings.Contains(notebook.Desc, option.ModelName) && notebook.Status == "running" { url := strings.Replace(notebook.Tasks[0].Url, FORWARD_SLASH, "", -1) names := strings.Split(notebook.Desc, FORWARD_SLASH) imageUrl := &inference.InferUrl{ Url: DOMAIN + url, Card: names[2], } imageUrls = append(imageUrls, imageUrl) } else { continue } } if len(imageUrls) == 0 { return nil, errors.New("no infer url available") } clusterWithUrl := &inference.ClusterInferUrl{ ClusterName: o.platform, ClusterType: TYPE_OCTOPUS, InferUrls: imageUrls, } return clusterWithUrl, nil } func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) { var insList []*inference.DeployInstance req := &octopus.GetNotebookListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, SearchKey: DEPLOY_INSTANCE_PREFIEX, } list, err := o.octopusRpc.GetNotebookList(ctx, req) if err != nil { return nil, err } if list.Error != nil { return nil, errors.New(list.Error.Message) } for _, notebook := range list.Payload.Notebooks { ins := &inference.DeployInstance{} ins.InstanceName = notebook.Name ins.InstanceId = notebook.Id ins.ClusterName = o.platform ins.Status = notebook.Status ins.ClusterType = TYPE_OCTOPUS insList = append(insList, ins) } return insList, nil } func (o *OctopusLink) StartInferDeployInstance(ctx context.Context, id string) bool { req := &octopus.StartNotebookReq{ Platform: o.platform, Id: id, } resp, err := o.octopusRpc.StartNotebook(ctx, req) if err != nil || !resp.Success { return false } return resp.Success } func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bool { req := &octopus.StopNotebookReq{ Platform: o.platform, Id: id, } resp, err := o.octopusRpc.StopNotebook(ctx, req) if err != nil || !resp.Success { return false } return resp.Success } func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) { ins := &inference.DeployInstance{} req := &octopus.GetNotebookReq{ Platform: o.platform, Id: id, } resp, err := o.octopusRpc.GetNotebook(ctx, req) if err != nil { return nil, err } if resp.Payload == nil { return nil, errors.New("instance does not exist") } url := strings.Replace(resp.Payload.Notebook.Tasks[0].Url, FORWARD_SLASH, "", -1) inferUrl := DOMAIN + url var modelType string var modelName string var card string if resp.Payload.Notebook.Desc != "" { str := strings.Split(resp.Payload.Notebook.Desc, FORWARD_SLASH) if len(str) == 3 { modelType = str[0] modelName = str[1] card = str[2] } } ins.InstanceName = resp.Payload.Notebook.Name ins.InstanceId = resp.Payload.Notebook.Id ins.ClusterName = o.platform ins.Status = resp.Payload.Notebook.Status ins.ClusterType = TYPE_OCTOPUS ins.ModelType = modelType ins.ModelName = modelName ins.InferUrl = inferUrl ins.InferCard = card return ins, nil } func (o *OctopusLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { stream, err := o.octopusRpc.GetInferResult(ctx) if err != nil { return "", err } buffer := make([]byte, 2048) bufferedReader := bufio.NewReader(file) for { _, err = bufferedReader.Read(buffer) if err != nil { if err != io.EOF { return "", err } break } err = stream.Send(&octopus.InferResultReq{ Platform: o.platform, InferUrl: url, FileName: fileName, FileBytes: buffer, }) } recv, err := stream.CloseAndRecv() if err != nil { return "", err } return recv.Result, nil } func (o *OctopusLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) { err := o.generateResourceId(ctx, nil, option) if err != nil { return "", err } err = o.generateAlgorithmId(ctx, nil, option) if err != nil { return "", err } err = o.generateImageId(ctx, nil, option) if err != nil { return "", err } err = o.generateCmd(nil, option) if err != nil { return "", err } desc := option.ModelType + FORWARD_SLASH + option.ModelName + FORWARD_SLASH + strings.ToLower(BIV100) param := &octopus.CreateNotebookParam{ Name: DEPLOY_INSTANCE_PREFIEX + DASH + utils.TimeString(), ResourcePool: RESOURCE_POOL, ResourceSpecId: option.ResourceId, AlgorithmId: option.AlgorithmId, AlgorithmVersion: VERSION, ImageId: option.ImageId, DatasetId: "", DatasetVersion: "", Command: option.Cmd, Desc: desc, TaskNumber: 1, } req := &octopus.CreateNotebookReq{ Platform: o.platform, Params: param, } resp, err := o.octopusRpc.CreateNotebook(ctx, req) if err != nil { return "", err } if !resp.Success { return "", errors.New(resp.Error.Message) } return resp.Payload.Id, nil } func (o *OctopusLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool { ifoption := &option.InferOption{ ModelName: name, ModelType: mtype, } err := o.generateAlgorithmId(ctx, nil, ifoption) if err != nil { return false } return true }