| @@ -930,6 +930,7 @@ func (m *ModelArtsLink) CheckImageExist(ctx context.Context, option *option.Infe | |||
| func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.ResourceSpec, error) { | |||
| var wg sync.WaitGroup | |||
| //查询modelarts资源规格 | |||
| req := &modelarts.GetResourceFlavorsReq{} | |||
| resp, err := m.modelArtsRpc.GetResourceFlavors(ctx, req) | |||
| if err != nil { | |||
| @@ -947,46 +948,49 @@ func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.Resour | |||
| var BalanceValue float64 = -1 | |||
| var RateValue float64 = 0.930000 | |||
| var StorageValue int64 = 1024 | |||
| var AvailableValue int64 = 886 | |||
| for _, Flavors := range resp.Items { | |||
| if Flavors.Metadata.Name == "modelarts.kat1.8xlarge" { | |||
| MoUsage.CpuSize, err = strconv.ParseInt(Flavors.Spec.Cpu, 10, 64) //CPU的值 | |||
| if err != nil { | |||
| // 如果转换失败,处理错误 | |||
| fmt.Println("转换错误:", err) | |||
| return nil, err | |||
| } | |||
| cpusum = MoUsage.CpuSize | |||
| MoUsage.NpuSize, err = strconv.ParseInt(Flavors.Spec.Npu.Size, 10, 64) //NPU的值 | |||
| if err != nil { | |||
| // 如果转换失败,处理错误 | |||
| fmt.Println("转换错误:", err) | |||
| return nil, err | |||
| } | |||
| npusum = MoUsage.NpuSize | |||
| re := regexp.MustCompile(`\d+`) | |||
| numberStr := re.FindString(Flavors.Spec.Memory) | |||
| MoUsage.MemorySize, err = strconv.ParseInt(numberStr, 10, 64) | |||
| //MoUsage.MemorySize, err = strconv.ParseInt(Flavors.Spec.Memory, 10, 64) //Memory的值 | |||
| numberStr := re.FindString(Flavors.Spec.Memory) //正则表达式去单位 | |||
| MoUsage.MemorySize, err = strconv.ParseInt(numberStr, 10, 64) //内存的值 | |||
| if err != nil { | |||
| // 如果转换失败,处理错误 | |||
| fmt.Println("转换错误:", err) | |||
| return nil, err | |||
| } | |||
| memorysum = MoUsage.MemorySize * 1024 | |||
| } | |||
| } | |||
| //查询获取训练作业支持的公共规格 | |||
| //查询获取训练作业支持的公共规格(包括1,2,4,8卡的选择和显存的数值) | |||
| reqJobFlavors := &modelarts.TrainingJobFlavorsReq{ | |||
| Platform: m.platform, | |||
| } | |||
| respJobFlavors, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, reqJobFlavors) | |||
| if err != nil { | |||
| wg.Done() | |||
| return nil, err | |||
| } | |||
| for _, TrainLists := range respJobFlavors.Flavors { | |||
| if TrainLists.FlavorId == "modelarts.kat1.8xlarge" { | |||
| re := regexp.MustCompile(`\d+`) | |||
| numberStr := re.FindString(string(TrainLists.FlavorInfo.Npu.Memory)) | |||
| MoUsage.VMemorySize, err = strconv.ParseInt(numberStr, 10, 64) //NPU的值 | |||
| numberStr := re.FindString(string(TrainLists.FlavorInfo.Npu.Memory)) //正则表达式去单位 | |||
| MoUsage.VMemorySize, err = strconv.ParseInt(numberStr, 10, 64) //显存的值 | |||
| VMemorysum = MoUsage.VMemorySize * int64(TrainLists.FlavorInfo.Npu.UnitNum) | |||
| } | |||
| } | |||
| @@ -999,7 +1003,7 @@ func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.Resour | |||
| respList, err := m.modelArtsRpc.GetListTrainingJobs(ctx, reqTraining) | |||
| if err != nil { | |||
| wg.Done() | |||
| return nil, err | |||
| } | |||
| var CoreNum int32 = 0 | |||
| var NpuNum int32 = 0 | |||
| @@ -1030,7 +1034,7 @@ func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.Resour | |||
| RunningTask := &collector.Usage{Type: strings.ToUpper(RUNNINGTASK), Total: &collector.UnitValue{Unit: NUMBER, Value: RunningTaskNum}} | |||
| Balance := &collector.Usage{Type: strings.ToUpper(BALANCE), Total: &collector.UnitValue{Unit: RMB, Value: BalanceValue}} | |||
| Rate := &collector.Usage{Type: strings.ToUpper(RATE), Total: &collector.UnitValue{Unit: PERHOUR, Value: RateValue}} | |||
| Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}} | |||
| Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}, Name: strings.ToUpper(DISK), Available: &collector.UnitValue{Unit: GIGABYTE, Value: AvailableValue}} | |||
| resUsage := &collector.ResourceSpec{ | |||
| ClusterId: strconv.FormatInt(m.participantId, 10), | |||