| @@ -50,9 +50,9 @@ k8s: | |||||
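| # Grafana URL of the dashboard that shows each Pod's CPU usage, memory usage, and GPU usage | # Grafana URL of the dashboard that shows each Pod's CPU usage, memory usage, and GPU usage | ||||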
| pod: | pod: | ||||
| metrics: | metrics: | ||||
| grafanaUrl: http://127.0.0.1:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod= | |||||
| grafanaUrl: http://grafana.dubhe.ai:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod= | |||||
| prometheus: | prometheus: | ||||
| url: http://127.0.0.1:30003/ | |||||
| url: http://10.5.26.91:30003/ | |||||
| query: api/v1/query | query: api/v1/query | ||||
| query-range: api/v1/query_range | query-range: api/v1/query_range | ||||
| gpu-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"})by(pod,acc_id) | gpu-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"})by(pod,acc_id) | ||||
| @@ -60,9 +60,16 @@ k8s: | |||||
| gpu-mem-use-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"})by(pod,acc_id) | gpu-mem-use-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"})by(pod,acc_id) | ||||
| cpu-range-query-param: sum(rate(container_cpu_usage_seconds_total{image!="",pod="pod-name-placeholder"}[1m])) by (pod) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod)) * 100 | cpu-range-query-param: sum(rate(container_cpu_usage_seconds_total{image!="",pod="pod-name-placeholder"}[1m])) by (pod) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod)) * 100 | ||||
| mem-range-query-param: sum(container_memory_rss{image!="",pod="pod-name-placeholder"}) | mem-range-query-param: sum(container_memory_rss{image!="",pod="pod-name-placeholder"}) | ||||
| gpu-usage-query-param: sum by(Hostname,gpu)(DCGM_FI_PROF_GR_ENGINE_ACTIVE{pod!=""}) | |||||
| gpu-range-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"}) by (pod,acc_id) | gpu-range-query-param: sum(container_accelerator_duty_cycle{pod="pod-name-placeholder"}) by (pod,acc_id) | ||||
| gpu-mem-total-range-query-param: sum(container_accelerator_memory_total_bytes{pod="pod-name-placeholder"}) by (pod,acc_id) | gpu-mem-total-range-query-param: sum(container_accelerator_memory_total_bytes{pod="pod-name-placeholder"}) by (pod,acc_id) | ||||
| gpu-mem-use-range-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"}) by (pod,acc_id) | gpu-mem-use-range-query-param: sum(container_accelerator_memory_used_bytes{pod="pod-name-placeholder"}) by (pod,acc_id) | ||||
| gpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:DCGM_FI_PROF_GR_ENGINE_ACTIVE:sumn[usage-rate-day]))) | |||||
| cpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten[usage-rate-day]))) | |||||
| mem-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace.*"}[usage-rate-day]))) | |||||
| gpu-usage-namespace-query-param: max_over_time(namespace:DCGM_FI_PROF_GR_ENGINE_ACTIVE:sumn{namespace=~"namespace-placeholder"}[usage-rate-day]) | |||||
| cpu-usage-namespace-query-param: max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten{namespace=~"namespace-placeholder"}[usage-rate-day]) | |||||
| mem-usage-namespace-query-param: max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace-placeholder"}[usage-rate-day]) | |||||
| nfs-storage-class-name: zjlab-nfs-storage | nfs-storage-class-name: zjlab-nfs-storage | ||||
| namespace-limits: | namespace-limits: | ||||
| cpu: 10 | cpu: 10 | ||||