diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index 8cddcb40..3f9d70cb 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -930,6 +930,7 @@ func (m *ModelArtsLink) CheckImageExist(ctx context.Context, option *option.Infe func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.ResourceSpec, error) { var wg sync.WaitGroup + //查询modelarts资源规格 req := &modelarts.GetResourceFlavorsReq{} resp, err := m.modelArtsRpc.GetResourceFlavors(ctx, req) if err != nil { @@ -947,46 +948,49 @@ func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.Resour var BalanceValue float64 = -1 var RateValue float64 = 0.930000 var StorageValue int64 = 1024 + var AvailableValue int64 = 886 for _, Flavors := range resp.Items { if Flavors.Metadata.Name == "modelarts.kat1.8xlarge" { MoUsage.CpuSize, err = strconv.ParseInt(Flavors.Spec.Cpu, 10, 64) //CPU的值 if err != nil { // 如果转换失败,处理错误 fmt.Println("转换错误:", err) + return nil, err } cpusum = MoUsage.CpuSize MoUsage.NpuSize, err = strconv.ParseInt(Flavors.Spec.Npu.Size, 10, 64) //NPU的值 if err != nil { // 如果转换失败,处理错误 fmt.Println("转换错误:", err) + return nil, err } npusum = MoUsage.NpuSize re := regexp.MustCompile(`\d+`) - numberStr := re.FindString(Flavors.Spec.Memory) - MoUsage.MemorySize, err = strconv.ParseInt(numberStr, 10, 64) - //MoUsage.MemorySize, err = strconv.ParseInt(Flavors.Spec.Memory, 10, 64) //Memory的值 + numberStr := re.FindString(Flavors.Spec.Memory) //正则表达式去单位 + MoUsage.MemorySize, err = strconv.ParseInt(numberStr, 10, 64) //内存的值 if err != nil { // 如果转换失败,处理错误 fmt.Println("转换错误:", err) + return nil, err } memorysum = MoUsage.MemorySize * 1024 } } - //查询获取训练作业支持的公共规格 + //查询获取训练作业支持的公共规格(包括1,2,4,8卡的选择和显存的数值) reqJobFlavors := &modelarts.TrainingJobFlavorsReq{ Platform: m.platform, } respJobFlavors, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, reqJobFlavors) if err != nil { wg.Done() - + return nil, err } for _, TrainLists := range respJobFlavors.Flavors { if TrainLists.FlavorId == "modelarts.kat1.8xlarge" { re := regexp.MustCompile(`\d+`) - numberStr := re.FindString(string(TrainLists.FlavorInfo.Npu.Memory)) - MoUsage.VMemorySize, err = strconv.ParseInt(numberStr, 10, 64) //NPU的值 + numberStr := re.FindString(string(TrainLists.FlavorInfo.Npu.Memory)) //正则表达式去单位 + MoUsage.VMemorySize, err = strconv.ParseInt(numberStr, 10, 64) //显存的值 VMemorysum = MoUsage.VMemorySize * int64(TrainLists.FlavorInfo.Npu.UnitNum) } } @@ -999,7 +1003,7 @@ func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.Resour respList, err := m.modelArtsRpc.GetListTrainingJobs(ctx, reqTraining) if err != nil { wg.Done() - + return nil, err } var CoreNum int32 = 0 var NpuNum int32 = 0 @@ -1030,7 +1034,7 @@ func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.Resour RunningTask := &collector.Usage{Type: strings.ToUpper(RUNNINGTASK), Total: &collector.UnitValue{Unit: NUMBER, Value: RunningTaskNum}} Balance := &collector.Usage{Type: strings.ToUpper(BALANCE), Total: &collector.UnitValue{Unit: RMB, Value: BalanceValue}} Rate := &collector.Usage{Type: strings.ToUpper(RATE), Total: &collector.UnitValue{Unit: PERHOUR, Value: RateValue}} - Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}} + Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}, Name: strings.ToUpper(DISK), Available: &collector.UnitValue{Unit: GIGABYTE, Value: AvailableValue}} resUsage := &collector.ResourceSpec{ ClusterId: strconv.FormatInt(m.participantId, 10),