|
- /*
-
- Copyright (c) [2023] [pcm]
- [pcm-coordinator] is licensed under Mulan PSL v2.
- You can use this software according to the terms and conditions of the Mulan PSL v2.
- You may obtain a copy of Mulan PSL v2 at:
- http://license.coscl.org.cn/MulanPSL2
- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
- See the Mulan PSL v2 for more details.
-
- */
-
- package storeLink
-
- import (
- "context"
- "fmt"
- "github.com/pkg/errors"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
- "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
- "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
- "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
- modelartsclient "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
- "io"
- "k8s.io/apimachinery/pkg/util/json"
- "log"
- "mime/multipart"
- "regexp"
- "strconv"
- "strings"
- "sync"
- "time"
- )
-
// Card labels and container start commands used by the ModelArts link.
const (
	// Ascend is the compute-card name returned by GetComputeCards.
	Ascend = "Ascend"
	// Npu is the lower-case card label used for the card entry in
	// GetResourceStats and in generated inference service names.
	Npu = "npu"
	// ImageNetResnet50Cmd is the start command CreateModel assigns to the
	// "imagenet_resnet50" model. The two steps are chained with "&&" so the
	// script only runs after the directory change succeeded; a single "&"
	// would run the cd in the background and start python in the wrong
	// working directory.
	ImageNetResnet50Cmd = "cd /home/ma-user && python ./inference_ascend.py"
	// ChatGLM6BCmd is the start command CreateModel assigns to the
	// "ChatGLM-6B" model.
	ChatGLM6BCmd = "cd /home/ma-user && python ./download_model.py && python ./inference_chatGLM.py"
	// ASCEND is the "ASCEND910" card label.
	ASCEND = "ASCEND910"
)
-
- type ModelArtsLink struct {
- modelArtsRpc modelartsservice.ModelArtsService
- modelArtsImgRpc imagesservice.ImagesService
- platform string
- participantId int64
- pageIndex int32
- pageSize int32
- SourceLocation string
- Version string
- ModelId string
- ModelType string
- }
-
// MoUsage is a scratch record of the per-flavor resource figures that
// GetResourceSpecs reads from ModelArts before turning them into usages.
type MoUsage struct {
	// Sizes reported for a flavor or specification.
	CpuSize, NpuSize, MemorySize, VMemorySize int64
	// VMemoryNumber is not written by the code visible in this file.
	VMemoryNumber int64
	// Amounts treated as available for the same flavor or specification.
	CpuAvailable, NpuAvailable, MemoryAvailable, VMemoryAvailable int64
}
-
// Version is a three-part numeric version number (major.minor.patch).
type Version struct {
	Major, Minor, Patch int
}

// ParseVersion parses a version string of the form "major.minor.patch".
//
// It returns an error when the string does not consist of exactly three
// dot-separated parts, when a part is not a decimal integer, or when a part
// is negative. On error the returned *Version is nil.
func ParseVersion(versionStr string) (*Version, error) {
	parts := strings.Split(versionStr, ".")
	if len(parts) != 3 {
		return nil, fmt.Errorf("invalid version format: %s", versionStr)
	}

	var nums [3]int
	for i, part := range parts {
		n, err := strconv.Atoi(part)
		if err != nil {
			return nil, fmt.Errorf("invalid version format: %s: %w", versionStr, err)
		}
		// Atoi accepts a leading sign; a negative component is never a
		// valid version and would break the carry rule in Increment.
		if n < 0 {
			return nil, fmt.Errorf("invalid version format: %s", versionStr)
		}
		nums[i] = n
	}

	return &Version{Major: nums[0], Minor: nums[1], Patch: nums[2]}, nil
}

// Increment advances the version by one step. Patch and Minor each roll over
// to 0 after 9 and carry into the next component; Major grows without bound.
func (v *Version) Increment() {
	if v.Patch < 9 {
		v.Patch++
		return
	}
	v.Patch = 0
	if v.Minor < 9 {
		v.Minor++
		return
	}
	v.Minor = 0
	v.Major++
}

// String formats the version back into "major.minor.patch".
func (v *Version) String() string {
	return fmt.Sprintf("%d.%d.%d", v.Major, v.Minor, v.Patch)
}
-
- func NewModelArtsLink(modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, name string, id int64, nickname string) *ModelArtsLink {
- return &ModelArtsLink{modelArtsRpc: modelArtsRpc, modelArtsImgRpc: modelArtsImgRpc, platform: nickname, participantId: id, pageIndex: 0, pageSize: 50}
- }
-
- func (m *ModelArtsLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
- //TODO modelArts上传镜像
- return nil, nil
- }
-
- func (m *ModelArtsLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
- // TODO modelArts删除镜像
- return nil, nil
- }
-
- func (m *ModelArtsLink) QueryImageList(ctx context.Context) (interface{}, error) {
- // modelArts获取镜像列表
- req := &modelarts.ListRepoReq{
- Offset: "0",
- Limit: strconv.Itoa(int(m.pageSize)),
- Platform: m.platform,
- }
- resp, err := m.modelArtsImgRpc.ListReposDetails(ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
- // modelArts提交任务
- environments := make(map[string]string)
- parameters := make([]*modelarts.ParametersTrainJob, 0)
- //parameters2 := make([]*modelarts.ParametersTrainJob, 0)
- inputs := make([]*modelarts.InputTraining, 0)
- outputs := make([]*modelarts.OutputTraining, 0)
- outputValue := ""
- for _, env := range envs {
- // 找到第一个逗号位置
- idx := strings.Index(env, COMMA)
- if idx == -1 {
- continue
- }
- key := strings.TrimSpace(env[:idx])
- value := strings.TrimSpace(env[idx+1:])
- environments[key] = value
- }
- for _, param := range params {
- s := strings.Split(param, COMMA)
- parameters = append(parameters, &modelarts.ParametersTrainJob{
- Name: s[0],
- Value: s[1],
- })
- if s[0] == "output" {
- outputValue = s[1]
- }
- }
- if len(datasetsId) != 0 {
- inputs = append(inputs, &modelarts.InputTraining{
- Name: "input",
- AccessMethod: "parameter",
- Remote: &modelarts.RemoteTra{
- Obs: &modelarts.ObsTra{
- ObsUrl: datasetsId + "/",
- },
- }})
- }
- if len(outputValue) != 0 {
- outputs = append(outputs, &modelarts.OutputTraining{
- Name: "output",
- Remote: &modelarts.RemoteOut{
- Obs: &modelarts.ObsTra{
- ObsUrl: "obs:/" + outputValue + "/",
- },
- }})
- }
- req := &modelarts.CreateTrainingJobReq{
- Kind: "job",
- Metadata: &modelarts.MetadataS{
- Name: TASK_NAME_PREFIX + utils.RandomString(10),
- WorkspaceId: "0",
- },
- Algorithm: &modelarts.Algorithms{
- Id: algorithmId,
- Engine: &modelarts.EngineCreateTraining{
- ImageUrl: imageId,
- },
- Command: cmd,
- Environments: environments,
- Parameters: parameters,
- Inputs: inputs,
- Outputs: outputs,
- },
- Spec: &modelarts.SpecsC{
- Resource: &modelarts.ResourceCreateTraining{
- FlavorId: resourceId,
- NodeCount: 1,
- },
- },
- Platform: m.platform,
- }
- marshal, err2 := json.Marshal(req)
- if err2 != nil {
-
- }
- println(string(marshal))
- resp, err := m.modelArtsRpc.CreateTrainingJob(ctx, req)
- if err != nil {
- return nil, err
- }
-
- if resp.ErrorMsg != "" {
- return nil, errors.New(resp.ErrorMsg)
- }
-
- return resp, nil
- }
-
- func (m *ModelArtsLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
- // 获取任务
- req := &modelarts.DetailTrainingJobsReq{
- TrainingJobId: taskId,
- Platform: m.platform,
- }
- resp, err := m.modelArtsRpc.GetTrainingJobs(ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (m *ModelArtsLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
- // 删除任务
- req := &modelarts.DeleteTrainingJobReq{
- TrainingJobId: taskId,
- Platform: m.platform,
- }
- resp, err := m.modelArtsRpc.DeleteTrainingJob(ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (m *ModelArtsLink) QuerySpecs(ctx context.Context) (interface{}, error) {
- // modelarts查询资源规格
- req := &modelarts.TrainingJobFlavorsReq{
- Platform: m.platform,
- }
- resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (m *ModelArtsLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
- req := &modelarts.GetPoolsRuntimeMetricsReq{}
- resp, err := m.modelArtsRpc.GetPoolsRuntimeMetrics(ctx, req)
- if err != nil {
- return nil, err
- }
- if resp.ErrorMsg != "" {
- return nil, errors.New("failed to get algorithms")
- }
- resourceStats := &collector.ResourceStats{}
- CpuCoreTotalSum := int64(0)
- CpuCoreAvailSum := int64(0)
- MemTotalSum := float64(0)
- MemAvailSum := float64(0)
- var CpuCoreTotal int64
- var CpuCoreAvail int64
- var MemTotal float64
- var MemAvail float64
- for _, items := range resp.Items {
- //TODO The value of taskType is temporarily fixed to "pytorch"
- CpuCoreTotal, err = strconv.ParseInt(items.Table.Capacity.Value.Cpu, 10, 64)
- CpuCoreTotalSum += CpuCoreTotal
- CpuCoreAvail, err = strconv.ParseInt(items.Table.Allocated.Value.Cpu, 10, 64)
- CpuCoreAvailSum += CpuCoreAvail
- MemTotal, err = strconv.ParseFloat(items.Table.Capacity.Value.Memory, 64)
- MemTotalSum += MemTotal
- MemAvail, err = strconv.ParseFloat(items.Table.Allocated.Value.Memory, 64)
- MemAvailSum += MemAvail
- }
- resourceStats.CpuCoreTotal = CpuCoreTotalSum
- resourceStats.CpuCoreAvail = CpuCoreAvailSum
- resourceStats.MemTotal = MemTotalSum
- resourceStats.MemAvail = MemAvailSum
- req1 := &modelarts.GetResourceFlavorsReq{}
- resp1, err := m.modelArtsRpc.GetResourceFlavors(ctx, req1)
-
- num32, _ := strconv.Atoi(resp1.Items[0].Spec.Npu.Size)
- var cards []*collector.Card
- card := &collector.Card{
- Platform: MODELARTS,
- Type: CARD,
- Name: Npu,
- CardNum: int32(num32),
- TOpsAtFp16: float64(num32 * 320),
- }
- cards = append(cards, card)
- resourceStats.CardsAvail = cards
-
- return resourceStats, nil
- }
-
- func (m *ModelArtsLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
- return nil, nil
- }
-
- func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
- var algorithms []*collector.Algorithm
-
- req := &modelarts.ListAlgorithmsReq{
- Platform: m.platform,
- Offset: m.pageIndex,
- Limit: m.pageSize,
- }
- resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
- if err != nil {
- return nil, err
- }
- if resp.ErrorMsg != "" {
- return nil, errors.New("failed to get algorithms")
- }
-
- for _, a := range resp.Items {
- //TODO The value of taskType is temporarily fixed to "pytorch"
- algorithm := &collector.Algorithm{Name: a.Metadata.Name, Platform: MODELARTS, TaskType: "pytorch"}
- algorithms = append(algorithms, algorithm)
- }
- return algorithms, nil
- }
-
- func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) {
- var cards []string
- cards = append(cards, Ascend)
- return cards, nil
- }
-
- func (m *ModelArtsLink) GetUserBalance(ctx context.Context) (float64, error) {
- return 0, nil
- }
-
- func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
- algoName := dataset + DASH + algorithm
- req := &modelarts.GetFileReq{
- Path: algoName + FORWARD_SLASH + TRAIN_FILE,
- }
- resp, err := m.modelArtsRpc.GetFile(ctx, req)
- if err != nil {
- return "", err
- }
-
- return string(resp.Content), nil
- }
-
- func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
- return nil
- }
-
- // Determine whether there is a necessary image in image management and query the image name based on the image name
- func (m *ModelArtsLink) getSourceLocationFromImages(ctx context.Context, option *option.InferOption) error {
- req := &modelarts.ListImagesReq{
- //Platform: m.platform,
- Limit: 50,
- Offset: 0,
- }
-
- ListImagesResp, err := m.modelArtsRpc.ListImages(ctx, req)
- if err != nil {
- return err
- }
- if ListImagesResp.Code != 200 {
- return errors.New("failed to get ListImages")
- }
-
- for _, ListImages := range ListImagesResp.Data {
- if option.ModelName == "ChatGLM-6B" {
- if ListImages.Name == "chatglm-6b" {
- m.SourceLocation = ListImages.SwrPath
- return nil
- }
- } else {
- if ListImages.Name == option.ModelName {
- m.SourceLocation = ListImages.SwrPath
- return nil
- }
- }
- }
- return errors.New("SourceLocation not set")
- }
-
- // Get AI Application List
- func (m *ModelArtsLink) GetModelId(ctx context.Context, option *option.InferOption) error {
- req := &modelarts.ListModelReq{
- Platform: m.platform,
- ModelName: option.ModelName,
- //ModelType: "Image",
- Limit: int64(m.pageIndex),
- Offset: int64(m.pageSize),
- }
- ListModelResp, err := m.modelArtsRpc.ListModels(ctx, req)
- if err != nil {
- return err
- }
- if ListModelResp.Code == 200 {
- //return errors.New("failed to get ModelId")
- for _, ListModel := range ListModelResp.Models {
- if ListModel.ModelName == option.ModelName {
- option.ModelId = ListModel.ModelId
- m.Version = ListModel.ModelVersion
- return nil
- }
- }
-
- }
- err = m.CreateModel(ctx, option)
- if err != nil {
- return err
- }
- return nil
- }
-
- func (m *ModelArtsLink) GetModel(ctx context.Context, option *option.InferOption) string {
- req := &modelarts.ShowModelReq{
- Platform: m.platform,
- ModelId: option.ModelID,
- }
- ctx, cancel := context.WithTimeout(context.Background(), 50*time.Second)
- defer cancel()
- ShowModelsResp, err := m.modelArtsRpc.ShowModels(ctx, req)
- if err != nil {
- if err == context.DeadlineExceeded {
- log.Println("Request timed out")
- // 重试请求或其他处理
- } else {
- log.Fatalf("could not call method: %v", err)
- }
- }
- if ShowModelsResp.Code != 200 {
- errors.New("failed to get findModelsStatus")
- }
- m.ModelType = ShowModelsResp.ShowModelDetail.ModelAlgorithm
- return ShowModelsResp.ShowModelDetail.ModelStatus
- }
-
- // Get AI Application List
- func (m *ModelArtsLink) GetModelStatus(ctx context.Context, option *option.InferOption) error {
- var wg sync.WaitGroup
- wg.Add(1)
- // 使用goroutine进行轮询
- //defer wg.Done()
- for {
- status := m.GetModel(ctx, option)
- if status == "published" {
- fmt.Println("Model is now published.")
- break // 一旦状态变为published,就退出循环
- }
- fmt.Println("Waiting for model to be published...")
- time.Sleep(5 * time.Second) // 等待一段时间后再次检查
- }
- // 在这里执行模型状态为published后需要进行的操作
- fmt.Println("Continuing with the program...")
- return nil
- }
-
- // Create an AI application
- func (m *ModelArtsLink) CreateModel(ctx context.Context, option *option.InferOption) error {
- //Before creating an AI application, check if there are any images that can be created
- err := m.getSourceLocationFromImages(ctx, option)
- if err != nil { //
- return errors.New("No image available for creationd")
- }
-
- //
- var CMD string
- if option.ModelName == "imagenet_resnet50" {
- CMD = ImageNetResnet50Cmd
- } else if option.ModelName == "ChatGLM-6B" {
- CMD = ChatGLM6BCmd
- }
-
- if m.Version == "" {
- m.Version = "0.0.1"
- }
- version, err := ParseVersion(m.Version)
- version.Increment()
- req := &modelarts.CreateModelReq{
- Platform: m.platform,
- ModelName: option.ModelName,
- ModelType: "Image",
- ModelVersion: version.String(),
- SourceLocation: m.SourceLocation,
- InstallType: []string{"real-time"},
- Cmd: CMD,
- ModelAlgorithm: option.ModelType,
- }
- ModelResp, err := m.modelArtsRpc.CreateModel(ctx, req)
- if err != nil {
- return err
- }
- if ModelResp.Code != 200 {
- return errors.New("failed to get ModelId")
- }
- option.ModelId = ModelResp.ModelId
-
- return nil
- }
-
- func (m *ModelArtsLink) GetSpecifications(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
- req := &modelarts.ListSpecificationsReq{
- //Platform: m.platform,
- IsPersonalCluster: false,
- InferType: "real-time",
- Limit: m.pageIndex,
- OffSet: m.pageSize,
- }
- ListSpecificationsResp, err := m.modelArtsRpc.ListSpecifications(ctx, req)
- if err != nil {
- return err
- }
- for _, ListSpecifications := range ListSpecificationsResp.Specifications {
- if ListSpecifications.Specification == "modelarts.kat1.xlarge" {
- ifoption.Specification = ListSpecifications.Specification
- return nil
- }
- }
- return nil
- }
-
- func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
- req := &modelartsservice.GetTrainingJobLogsPreviewReq{
- Platform: m.platform,
- TaskId: "worker-0",
- TrainingJobId: taskId,
- }
-
- //resp, err := m.modelArtsRpc.GetTrainingJobLogsPreview(ctx, req)
- stream, err := m.modelArtsRpc.GetTrainingJobLogStream(ctx, req)
- if err != nil {
- log.Fatalf("error calling StreamLogs: %v", err)
- }
-
- var fullLog string
- for {
- // 接收服务端发送的日志块
- logEntry, err := stream.Recv()
- if err == io.EOF {
- // 流结束
- break
- }
- if err != nil {
- log.Fatalf("接收日志块失败: %v", err)
- }
- // 拼接日志块
- fullLog += logEntry.Message
- }
-
- return fullLog, nil
-
- /* if strings.Contains(resp.Content, "404 Not Found") {
- = "waiting for logs..."
- }*/
- //return resp.Content, nil
- }
-
- func (m *ModelArtsLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
- resp, err := m.QueryTask(ctx, taskId)
- if err != nil {
- return nil, err
- }
- jobresp, ok := (resp).(*modelartsservice.JobResponse)
- if jobresp.ErrorMsg != "" || !ok {
- if jobresp.ErrorMsg != "" {
- return nil, errors.New(jobresp.ErrorMsg)
- } else {
- return nil, errors.New("get training task failed, empty error returned")
- }
- }
- var task collector.Task
- task.Id = jobresp.Metadata.Id
-
- switch strings.ToLower(jobresp.Status.Phase) {
- case "completed":
- milliTimestamp := int64(jobresp.Status.StartTime)
- task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
- duration := int64(jobresp.Status.Duration)
- task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
- task.Status = constants.Completed
- case "failed":
- milliTimestamp := int64(jobresp.Status.StartTime)
- task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
- duration := int64(jobresp.Status.Duration)
- task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
- task.Status = constants.Failed
- case "running":
- milliTimestamp := int64(jobresp.Status.StartTime)
- task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
- task.Status = constants.Running
- case "stopped":
- task.Status = constants.Stopped
- case "pending":
- task.Status = constants.Pending
- case "terminated":
- //TODO Failed
- task.Status = constants.Failed
- default:
- task.Status = "undefined"
- }
-
- return &task, nil
- }
-
- func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
- switch mode {
- case executor.SUBMIT_MODE_JOINT_CLOUD:
- err := m.GenerateSubmitParams(ctx, option)
- if err != nil {
- return nil, err
- }
- case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
- var ascendNum int32
- for _, res := range option.ResourcesRequired {
- typeName, ok := res["type"]
- if !ok {
- continue
- }
- switch typeName {
- case "NPU":
- num, ok := res["number"]
- if !ok {
- continue
- }
- n := common.ConvertTypeToString(num)
- val, err := strconv.ParseInt(n, 10, 32)
- if err != nil {
- return nil, err
- }
-
- ascendNum = int32(val)
- }
- }
-
- req := &modelarts.TrainingJobFlavorsReq{
- Platform: "modelarts-CloudBrain2",
- FlavorType: "",
- }
- resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
- for _, v := range resp.Flavors {
- if ascendNum == v.FlavorInfo.Npu.UnitNum {
- option.ResourceId = v.FlavorId
- break
- } else if ascendNum <= 1 {
- option.ResourceId = "modelarts.kat1.xlarge"
- break
- } else if ascendNum == 2 {
- option.ResourceId = "modelarts.kat1.2xlarge"
- break
- } else if ascendNum > 2 && ascendNum <= 4 {
- option.ResourceId = "modelarts.kat1.4xlarge"
- break
- } else if ascendNum >= 5 && ascendNum <= 8 {
- option.ResourceId = "modelarts.kat1.8xlarge"
- break
- } else if ascendNum > 8 {
- option.ResourceId = "modelarts.kat1.8xlarge"
- break
- }
- }
- if err != nil {
- return nil, err
- }
- option.ComputeCard = NPU
- default:
- return nil, errors.New("failed to choose submit mode")
- }
- task, err := m.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
- if err != nil {
- return nil, err
- }
- return task, nil
- }
-
- func (m *ModelArtsLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
- err := m.generateResourceId(ctx, option, nil)
- if err != nil {
- return err
- }
- err = m.generateAlgorithmId(ctx, option)
- if err != nil {
- return err
- }
- err = m.generateImageId(option)
- if err != nil {
- return err
- }
- err = m.generateCmd(option)
- if err != nil {
- return err
- }
- err = m.generateEnv(option)
- if err != nil {
- return err
- }
- err = m.generateParams(option)
- if err != nil {
- return err
- }
- return nil
- }
-
- func (m *ModelArtsLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
- option.ResourceId = "modelarts.kat1.xlarge"
- return nil
- }
-
- func (m *ModelArtsLink) generateImageId(option *option.AiOption) error {
-
- return nil
- }
-
- func (m *ModelArtsLink) generateCmd(option *option.AiOption) error {
-
- return nil
- }
-
- func (m *ModelArtsLink) generateEnv(option *option.AiOption) error {
-
- return nil
- }
-
- func (m *ModelArtsLink) generateParams(option *option.AiOption) error {
-
- return nil
- }
-
- func (m *ModelArtsLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
- req := &modelarts.ListAlgorithmsReq{
- Platform: m.platform,
- Offset: m.pageIndex,
- Limit: m.pageSize,
- }
- resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
- if err != nil {
- return err
- }
- if resp.ErrorMsg != "" {
- return errors.New("failed to get algorithmId")
- }
-
- for _, algorithm := range resp.Items {
- engVersion := algorithm.JobConfig.Engine.EngineVersion
- if strings.Contains(engVersion, option.TaskType) {
- ns := strings.Split(algorithm.Metadata.Name, DASH)
- if ns[0] != option.TaskType {
- continue
- }
- if ns[1] != option.DatasetsName {
- continue
- }
- if ns[2] != option.AlgorithmName {
- continue
- }
- option.AlgorithmId = algorithm.Metadata.Id
- return nil
- }
- }
-
- if option.AlgorithmId == "" {
- return errors.New("Algorithm does not exist")
- }
-
- return errors.New("failed to get AlgorithmId")
- }
-
- func (m *ModelArtsLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
- var imageUrls []*inference.InferUrl
- urlReq := &modelartsclient.ImageReasoningUrlReq{
- ServiceName: option.ModelName,
- Type: option.ModelType,
- Card: "npu",
- }
- urlResp, err := m.modelArtsRpc.ImageReasoningUrl(ctx, urlReq)
- if err != nil {
- return nil, err
- }
- imageUrl := &inference.InferUrl{
- Url: urlResp.Url,
- Card: "npu",
- }
- imageUrls = append(imageUrls, imageUrl)
-
- clusterWithUrl := &inference.ClusterInferUrl{
- ClusterName: m.platform,
- ClusterType: TYPE_MODELARTS,
- InferUrls: imageUrls,
- }
- return clusterWithUrl, nil
- }
-
- func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
- var insList []*inference.DeployInstance
- req := &modelarts.ListServicesReq{
- Platform: m.platform,
- OffSet: m.pageIndex,
- Limit: m.pageSize,
- }
- //list, err := m.modelArtsRpc.ListServices(ctx, req)
- resp, err := m.modelArtsRpc.ListServices(ctx, req)
- if err != nil {
- return nil, err
- }
- if resp.ErrorMsg != "" {
- return nil, errors.New(resp.Msg)
- }
-
- for _, services := range resp.Services {
- ins := &inference.DeployInstance{}
- ins.InstanceName = services.ServiceName
- ins.InstanceId = services.ServiceId
- ins.Status = services.Status
- ins.InferCard = "NPU"
- ins.ClusterName = m.platform
- ins.CreatedTime = string(services.StartTime)
- ins.ClusterType = TYPE_MODELARTS
- insList = append(insList, ins)
- }
-
- return insList, nil
- }
-
- func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool {
- req := &modelartsclient.UpdateServiceReq{
- ServiceId: id,
- Status: "running",
- }
- resp, err := m.modelArtsRpc.UpdateService(ctx, req)
- if err != nil || resp.Code != 0 {
- return false
- }
- if resp.Code == 0 {
- return true
- }
- return false
- }
-
- func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool {
- req := &modelartsclient.UpdateServiceReq{
- ServiceId: id,
- Status: "stopped",
- }
- resp, err := m.modelArtsRpc.UpdateService(ctx, req)
- if err != nil || resp.Code != 0 {
- return false
- }
- if resp.Code == 0 {
- return true
- }
- return false
- }
-
- func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
- req := &modelarts.ShowServiceReq{
- ServiceId: id,
- }
- resp, err := m.modelArtsRpc.ShowService(ctx, req)
- if err != nil {
- return nil, err
- }
- /* if resp.ErrorMsg != "" {
- return nil, errors.New(resp.Msg)
- }*/
- ins := &inference.DeployInstance{}
- ins.InstanceName = resp.ServiceName
- ins.InstanceId = resp.ServiceId
- ins.Status = resp.Status
- ins.InferCard = "NPU"
- ins.ClusterName = m.platform
- ins.CreatedTime = string(resp.StartTime)
- ins.ClusterType = TYPE_MODELARTS
- if resp.Config != nil {
- ins.ModelName = resp.Config[0].ModelName
- }
- if m.ModelType != "" {
- ins.ModelType = m.ModelType
- }
- ins.InferUrl = resp.AccessAddress
- return ins, nil
- }
-
- func (m *ModelArtsLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
- return "", nil
- }
-
- func (m *ModelArtsLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
-
- /* err := m.GetModelId(ctx, option)
- if err != nil {
- return "", err
- }*/
-
- err := m.GetModelStatus(ctx, option)
- if err != nil {
- return "模型状态查询错误", err
- }
-
- configParam := &modelarts.ServiceConfig{
- Specification: "modelarts.kat1.xlarge",
- Weight: 100,
- ModelId: option.ModelID,
- InstanceCount: 1,
- }
- var configItems []*modelarts.ServiceConfig
- configItems = append(configItems, configParam)
- now := time.Now()
- timestampSec := now.Unix()
- str := strconv.FormatInt(timestampSec, 10)
- req := &modelarts.CreateServiceReq{
- Platform: m.platform,
- Config: configItems,
- InferType: "real-time",
- ServiceName: option.ModelName + "_" + option.ModelType + "_" + Npu + "_" + str,
- }
- ctx, cancel := context.WithTimeout(context.Background(), 150*time.Second)
- defer cancel()
- resp, err := m.modelArtsRpc.CreateService(ctx, req)
- if err != nil {
- return "", err
- }
-
- return resp.ServiceId, nil
- }
-
- func (m *ModelArtsLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
- ifoption := &option.InferOption{
- ModelName: name,
- ModelType: mtype,
- }
- err := m.CheckImageExist(ctx, ifoption)
- if err != nil {
- return false
- }
-
- return true
- }
-
- func (m *ModelArtsLink) CheckImageExist(ctx context.Context, option *option.InferOption) error {
- req := &modelarts.ListImagesReq{
- Limit: m.pageSize,
- Offset: m.pageIndex,
- }
- ListImageResp, err := m.modelArtsRpc.ListImages(ctx, req)
- if err != nil {
- return err
- }
- var modelName string
- if ListImageResp.Code == 200 {
- //return errors.New("failed to get ModelId")
- for _, ListImage := range ListImageResp.Data {
- if option.ModelName == "ChatGLM-6B" {
- modelName = "chatglm-6b"
- } else {
- modelName = option.ModelName
- }
-
- if ListImage.Name == modelName {
- return nil
- }
- }
- }
- return errors.New("failed to find Image ")
- }
-
- func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
- MoUsage := MoUsage{}
- var cpusum int64 = 0
- var npusum int64 = 0
- var memorysum int64 = 0
- var VMemorysum int64 = 0
- var RunningTaskNum int64 = 0
- var BalanceValue float64 = -1
- var RateValue = 0.930000
- var StorageValue int64 = 1024
- var AvailableValue int64 = 886
-
- resUsage := &collector.ResourceSpec{
- ClusterId: strconv.FormatInt(m.participantId, 10),
- }
-
- switch resrcType {
- case "Train":
- //查询获取训练作业支持的公共规格(包括1,2,4,8卡的选择和显存的数值)
- reqJobFlavors := &modelarts.TrainingJobFlavorsReq{
- Platform: m.platform,
- }
- respJobFlavors, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, reqJobFlavors)
- if err != nil {
- return nil, err
- }
- respJobFlavorsMarshal, err2 := json.Marshal(respJobFlavors)
- if err2 != nil {
-
- }
- println(string(respJobFlavorsMarshal))
-
- for _, TrainLists := range respJobFlavors.Flavors {
- re := regexp.MustCompile(`\d+`)
- VMemorynumberStr := re.FindString(string(TrainLists.FlavorInfo.Npu.Memory)) //显存的值,正则表达式去单位
- MoUsage.VMemorySize, err = strconv.ParseInt(VMemorynumberStr, 10, 64) //显存的值
-
- MoUsage.NpuSize = int64(TrainLists.FlavorInfo.Npu.UnitNum) //npu数量,张数
-
- MoUsage.CpuAvailable = int64(TrainLists.FlavorInfo.Cpu.CoreNum) //cpu核数
- MoUsage.MemoryAvailable = int64(TrainLists.FlavorInfo.Memory.Size) //内存大小
-
- npusum = MoUsage.NpuSize
- MoUsage.NpuAvailable = MoUsage.NpuSize
-
- cpusum = MoUsage.CpuAvailable
- memorysum = MoUsage.MemoryAvailable
- VMemorysum = MoUsage.VMemorySize
- MoUsage.VMemoryAvailable = MoUsage.VMemorySize
-
- str := fmt.Sprintf("%d", MoUsage.NpuSize) // 使用%d格式化占位符
- ASCENDName := str + "*ASCEND910"
-
- UsageCPU := &collector.Usage{Type: strings.ToUpper(CPU), Name: strings.ToUpper("ARM"), Total: &collector.UnitValue{Unit: CPUCORE, Value: cpusum}, Available: &collector.UnitValue{Unit: CPUCORE, Value: MoUsage.CpuAvailable}}
- UsageNPU := &collector.Usage{Type: strings.ToUpper(NPU), Name: ASCENDName, Total: &collector.UnitValue{Unit: NUMBER, Value: npusum}, Available: &collector.UnitValue{Unit: NUMBER, Value: MoUsage.NpuAvailable}}
- UsageMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(RAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: memorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.MemoryAvailable}}
- UsageVMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(VRAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: VMemorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.VMemoryAvailable}}
- Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}, Name: strings.ToUpper("disk"), Available: &collector.UnitValue{Unit: GIGABYTE, Value: AvailableValue}}
-
- cres := &collector.ClusterResource{}
- cres.Resource = UsageNPU
- cres.BaseResources = append(cres.BaseResources, UsageCPU)
- cres.BaseResources = append(cres.BaseResources, UsageMEMORY)
- cres.BaseResources = append(cres.BaseResources, UsageVMEMORY)
- cres.BaseResources = append(cres.BaseResources, Storage)
-
- resUsage.Resources = append(resUsage.Resources, cres)
- }
-
- RunningTask := &collector.Usage{Type: strings.ToUpper(RUNNINGTASK), Total: &collector.UnitValue{Unit: NUMBER, Value: RunningTaskNum}}
- Balance := &collector.Usage{Type: strings.ToUpper(BALANCE), Total: &collector.UnitValue{Unit: RMB, Value: BalanceValue}}
- Rate := &collector.Usage{Type: strings.ToUpper(RATE), Total: &collector.UnitValue{Unit: PERHOUR, Value: RateValue}}
-
- RunningTaskRes := &collector.ClusterResource{}
- RunningTaskRes.Resource = RunningTask
-
- BalanceRes := &collector.ClusterResource{}
- BalanceRes.Resource = Balance
-
- RateRes := &collector.ClusterResource{}
- RateRes.Resource = Rate
-
- resUsage.Resources = append(resUsage.Resources, RunningTaskRes)
- resUsage.Resources = append(resUsage.Resources, BalanceRes)
- resUsage.Resources = append(resUsage.Resources, RateRes)
- resUsage.Tag = "Train"
-
- case "Inference":
- req := &modelarts.ListSpecificationsReq{
- //Platform: m.platform,
- IsPersonalCluster: true,
- InferType: "real-time",
- Limit: m.pageIndex,
- OffSet: m.pageSize,
- }
- ListSpecificationsResp, err := m.modelArtsRpc.ListSpecifications(ctx, req)
- if err != nil {
- return nil, err
- }
- respJobSpecificationsMarshal, err2 := json.Marshal(ListSpecificationsResp)
- if err2 != nil {
-
- }
- println(string(respJobSpecificationsMarshal))
-
- for _, Specifications := range ListSpecificationsResp.Specifications {
- if Specifications.SpecStatus == "normal" {
- MoUsage.VMemorySize = int64(Specifications.NpuInfo.Memory) //显存的值
-
- MoUsage.NpuSize = int64(Specifications.NpuInfo.Npu) //npu数量,张数
-
- MoUsage.CpuAvailable = int64(Specifications.CpuInfo.Cpu) //cpu核数
- MoUsage.MemoryAvailable = int64(Specifications.MemoryInfo.Memory) //内存大小
-
- npusum = MoUsage.NpuSize
- MoUsage.NpuAvailable = MoUsage.NpuSize
-
- cpusum = MoUsage.CpuAvailable
- memorysum = MoUsage.MemoryAvailable
- VMemorysum = MoUsage.VMemorySize
- MoUsage.VMemoryAvailable = MoUsage.VMemorySize
-
- ASCENDName := Specifications.DisplayCn + Specifications.Specification
-
- UsageCPU := &collector.Usage{Type: strings.ToUpper(CPU), Name: strings.ToUpper("ARM"), Total: &collector.UnitValue{Unit: CPUCORE, Value: cpusum}, Available: &collector.UnitValue{Unit: CPUCORE, Value: MoUsage.CpuAvailable}}
- UsageNPU := &collector.Usage{Type: strings.ToUpper(NPU), Name: ASCENDName, Total: &collector.UnitValue{Unit: NUMBER, Value: npusum}, Available: &collector.UnitValue{Unit: NUMBER, Value: MoUsage.NpuAvailable}}
- UsageMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(RAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: memorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.MemoryAvailable}}
- UsageVMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(VRAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: VMemorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.VMemoryAvailable}}
- Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}, Name: strings.ToUpper("disk"), Available: &collector.UnitValue{Unit: GIGABYTE, Value: AvailableValue}}
-
- cres := &collector.ClusterResource{}
- cres.Resource = UsageNPU
- cres.BaseResources = append(cres.BaseResources, UsageCPU)
- cres.BaseResources = append(cres.BaseResources, UsageMEMORY)
- cres.BaseResources = append(cres.BaseResources, UsageVMEMORY)
- cres.BaseResources = append(cres.BaseResources, Storage)
- resUsage.Tag = "Inference"
- resUsage.Resources = append(resUsage.Resources, cres)
- }
- }
-
- RunningTask := &collector.Usage{Type: strings.ToUpper(RUNNINGTASK), Total: &collector.UnitValue{Unit: NUMBER, Value: RunningTaskNum}}
- Balance := &collector.Usage{Type: strings.ToUpper(BALANCE), Total: &collector.UnitValue{Unit: RMB, Value: BalanceValue}}
- Rate := &collector.Usage{Type: strings.ToUpper(RATE), Total: &collector.UnitValue{Unit: PERHOUR, Value: RateValue}}
-
- RunningTaskRes := &collector.ClusterResource{}
- RunningTaskRes.Resource = RunningTask
-
- BalanceRes := &collector.ClusterResource{}
- BalanceRes.Resource = Balance
-
- RateRes := &collector.ClusterResource{}
- RateRes.Resource = Rate
-
- resUsage.Resources = append(resUsage.Resources, RunningTaskRes)
- resUsage.Resources = append(resUsage.Resources, BalanceRes)
- resUsage.Resources = append(resUsage.Resources, RateRes)
-
- }
- return resUsage, nil
- }
-
- func (m *ModelArtsLink) Stop(ctx context.Context, id string) error {
- req := &modelarts.StopTrainingJobReq{
- TrainingJobId: id,
- ActionType: "terminate",
- }
- resp, err := m.modelArtsRpc.StopTrainingJob(ctx, req)
- if err != nil {
- return err
- }
- if resp.Code != 0 {
- return errors.New(resp.ErrorMsg)
- }
- return nil
- }
|