
promql.go 39 kB

package tracker

import (
	"fmt"
	"strings"
)

const (
	StatefulSet = "StatefulSet"
	DaemonSet   = "DaemonSet"
	Deployment  = "Deployment"
)

var promQLTemplates = map[string]string{
	// cluster
	"cluster_cpu_utilisation": ":node_cpu_utilisation:avg1m",
	"cluster_cpu_usage": `round(:node_cpu_utilisation:avg1m * sum(node:node_num_cpu:sum), 0.001)`,
	"cluster_cpu_total": "sum(node:node_num_cpu:sum)",
	"cluster_memory_utilisation": ":node_memory_utilisation:",
	"cluster_memory_available": "sum(node:node_memory_bytes_available:sum)",
	"cluster_memory_total": "sum(node:node_memory_bytes_total:sum)",
	"cluster_memory_usage_wo_cache": "sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)",
	"cluster_net_utilisation": ":node_net_utilisation:sum_irate",
	"cluster_net_bytes_transmitted": "sum(node:node_net_bytes_transmitted:sum_irate)",
	"cluster_net_bytes_received": "sum(node:node_net_bytes_received:sum_irate)",
	"cluster_disk_read_iops": "sum(node:data_volume_iops_reads:sum)",
	"cluster_disk_write_iops": "sum(node:data_volume_iops_writes:sum)",
	"cluster_disk_read_throughput": "sum(node:data_volume_throughput_bytes_read:sum)",
	"cluster_disk_write_throughput": "sum(node:data_volume_throughput_bytes_written:sum)",
	"cluster_disk_size_usage": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
	"cluster_disk_size_utilisation": `cluster:disk_utilization:ratio`,
	"cluster_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
	"cluster_disk_size_available": `sum(max(node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
	"cluster_disk_inode_total": `sum(node:node_inodes_total:)`,
	"cluster_disk_inode_usage": `sum(node:node_inodes_total:) - sum(node:node_inodes_free:)`,
	"cluster_disk_inode_utilisation": `cluster:disk_inode_utilization:ratio`,
	"cluster_namespace_count": `count(kube_namespace_labels)`,
	"cluster_pod_count": `cluster:pod:sum`,
	"cluster_pod_quota": `sum(max(kube_node_status_capacity{resource="pods"}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
	"cluster_pod_utilisation": `cluster:pod_utilization:ratio`,
	"cluster_pod_running_count": `cluster:pod_running:count`,
	"cluster_pod_succeeded_count": `count(kube_pod_info unless on (pod) (kube_pod_status_phase{phase=~"Failed|Pending|Unknown|Running"} > 0) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
	"cluster_pod_abnormal_count": `cluster:pod_abnormal:sum`,
	"cluster_node_online": `sum(kube_node_status_condition{condition="Ready",status="true"})`,
	"cluster_node_offline": `cluster:node_offline:sum`,
	"cluster_node_total": `sum(kube_node_status_condition{condition="Ready"})`,
	"cluster_cronjob_count": `sum(kube_cronjob_labels)`,
	"cluster_pvc_count": `sum(kube_persistentvolumeclaim_info)`,
	"cluster_daemonset_count": `sum(kube_daemonset_labels)`,
	"cluster_deployment_count": `sum(kube_deployment_labels)`,
	"cluster_endpoint_count": `sum(kube_endpoint_labels)`,
	"cluster_hpa_count": `sum(kube_horizontalpodautoscaler_labels)`,
	"cluster_job_count": `sum(kube_job_labels)`,
	"cluster_statefulset_count": `sum(kube_statefulset_labels)`,
	"cluster_replicaset_count": `count(kube_replicaset_labels)`,
	"cluster_service_count": `sum(kube_service_info)`,
	"cluster_secret_count": `sum(kube_secret_info)`,
	"cluster_pv_count": `sum(kube_persistentvolume_labels)`,
	"cluster_ingresses_extensions_count": `sum(kube_ingress_labels)`,
	"cluster_load1": `sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
	"cluster_load5": `sum(node_load5{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
	"cluster_load15": `sum(node_load15{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
	"cluster_pod_abnormal_ratio": `cluster:pod_abnormal:ratio`,
	"cluster_node_offline_ratio": `cluster:node_offline:ratio`,
	// node
	"node_cpu_utilisation": "node:node_cpu_utilisation:avg1m{$1}",
	"node_cpu_total": "node:node_num_cpu:sum{$1}",
	"node_memory_utilisation": "node:node_memory_utilisation:{$1}",
	"node_memory_available": "node:node_memory_bytes_available:sum{$1}",
	"node_memory_total": "node:node_memory_bytes_total:sum{$1}",
	"node_memory_usage_wo_cache": "node:node_memory_bytes_total:sum{$1} - node:node_memory_bytes_available:sum{$1}",
	"node_net_utilisation": "node:node_net_utilisation:sum_irate{$1}",
	"node_net_bytes_transmitted": "node:node_net_bytes_transmitted:sum_irate{$1}",
	"node_net_bytes_received": "node:node_net_bytes_received:sum_irate{$1}",
	"node_disk_read_iops": "node:data_volume_iops_reads:sum{$1}",
	"node_disk_write_iops": "node:data_volume_iops_writes:sum{$1}",
	"node_disk_read_throughput": "node:data_volume_throughput_bytes_read:sum{$1}",
	"node_disk_write_throughput": "node:data_volume_throughput_bytes_written:sum{$1}",
	"node_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
	"node_disk_size_available": `node:disk_space_available:{$1}`,
	"node_disk_size_usage": `sum(max((node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
	"node_disk_size_utilisation": `node:disk_space_utilization:ratio{$1}`,
	"node_disk_inode_total": `node:node_inodes_total:{$1}`,
	"node_disk_inode_usage": `node:node_inodes_total:{$1} - node:node_inodes_free:{$1}`,
	"node_disk_inode_utilisation": `node:disk_inode_utilization:ratio{$1}`,
	"node_pod_count": `node:pod_count:sum{$1}`,
	"node_pod_quota": `max(kube_node_status_capacity{resource="pods",$1}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0)`,
	"node_pod_utilisation": `node:pod_utilization:ratio{$1}`,
	"node_pod_running_count": `node:pod_running:count{$1}`,
	"node_pod_succeeded_count": `node:pod_succeeded:count{$1}`,
	"node_pod_abnormal_count": `node:pod_abnormal:count{$1}`,
	"node_cpu_usage": `round(node:node_cpu_utilisation:avg1m{$1} * node:node_num_cpu:sum{$1}, 0.001)`,
	"node_load1": `node:load1:ratio{$1}`,
	"node_load5": `node:load5:ratio{$1}`,
	"node_load15": `node:load15:ratio{$1}`,
	"node_pod_abnormal_ratio": `node:pod_abnormal:ratio{$1}`,
	"node_pleg_quantile": `node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{$1}`,
	"node_device_size_usage": `sum by(device, node, host_ip, role) (node_filesystem_size_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1}) - sum by(device, node, host_ip, role) (node_filesystem_avail_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1})`,
	"node_device_size_utilisation": `1 - sum by(device, node, host_ip, role) (node_filesystem_avail_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1}) / sum by(device, node, host_ip, role) (node_filesystem_size_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1})`,
	// workspace
	"workspace_cpu_usage": `round(sum by (workspace) (namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}), 0.001)`,
	"workspace_memory_usage": `sum by (workspace) (namespace:container_memory_usage_bytes:sum{namespace!="", $1})`,
	"workspace_memory_usage_wo_cache": `sum by (workspace) (namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1})`,
	"workspace_net_bytes_transmitted": `sum by (workspace) (sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
	"workspace_net_bytes_received": `sum by (workspace) (sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
	"workspace_pod_count": `sum by (workspace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
	"workspace_pod_running_count": `sum by (workspace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
	"workspace_pod_succeeded_count": `sum by (workspace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
	"workspace_pod_abnormal_count": `count by (workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_ingresses_extensions_count": `sum by (workspace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_cronjob_count": `sum by (workspace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_pvc_count": `sum by (workspace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_daemonset_count": `sum by (workspace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_deployment_count": `sum by (workspace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_endpoint_count": `sum by (workspace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_hpa_count": `sum by (workspace) (kube_horizontalpodautoscaler_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_job_count": `sum by (workspace) (kube_job_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_statefulset_count": `sum by (workspace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_replicaset_count": `count by (workspace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_service_count": `sum by (workspace) (kube_service_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_secret_count": `sum by (workspace) (kube_secret_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	"workspace_pod_abnormal_ratio": `count by (workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) / sum by (workspace) (kube_pod_status_phase{phase!="Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
	// namespace
	"namespace_cpu_usage": `round(namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}, 0.001)`,
	"namespace_memory_usage": `namespace:container_memory_usage_bytes:sum{namespace!="", $1}`,
	"namespace_memory_usage_wo_cache": `namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1}`,
	"namespace_net_bytes_transmitted": `sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
	"namespace_net_bytes_received": `sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
	"namespace_pod_count": `sum by (namespace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
	"namespace_pod_running_count": `sum by (namespace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
	"namespace_pod_succeeded_count": `sum by (namespace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
	"namespace_pod_abnormal_count": `namespace:pod_abnormal:count{namespace!="", $1}`,
	"namespace_pod_abnormal_ratio": `namespace:pod_abnormal:ratio{namespace!="", $1}`,
	"namespace_memory_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.memory"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_cpu_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.cpu"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_pod_count_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="count/pods"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_cronjob_count": `sum by (namespace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_pvc_count": `sum by (namespace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_daemonset_count": `sum by (namespace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_deployment_count": `sum by (namespace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_endpoint_count": `sum by (namespace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_hpa_count": `sum by (namespace) (kube_horizontalpodautoscaler_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_job_count": `sum by (namespace) (kube_job_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_statefulset_count": `sum by (namespace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_replicaset_count": `count by (namespace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_service_count": `sum by (namespace) (kube_service_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_secret_count": `sum by (namespace) (kube_secret_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_configmap_count": `sum by (namespace) (kube_configmap_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_ingresses_extensions_count": `sum by (namespace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	"namespace_s2ibuilder_count": `sum by (namespace) (s2i_s2ibuilder_created{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
	// ingress
	"ingress_request_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2}[$3])))`,
	"ingress_request_4xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[4].*"}[$3])))`,
	"ingress_request_5xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[5].*"}[$3])))`,
	"ingress_active_connections": `sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{$2,state="active"}[$3]))`,
	"ingress_success_rate": `sum(rate(nginx_ingress_controller_requests{$1,$2,status!~"[4-5].*"}[$3])) / sum(rate(nginx_ingress_controller_requests{$1,$2}[$3]))`,
	"ingress_request_duration_average": `sum_over_time(nginx_ingress_controller_request_duration_seconds_sum{$1,$2}[$3])/sum_over_time(nginx_ingress_controller_request_duration_seconds_count{$1,$2}[$3])`,
	"ingress_request_duration_50percentage": `histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
	"ingress_request_duration_95percentage": `histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
	"ingress_request_duration_99percentage": `histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
	"ingress_request_volume": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])), 0.001)`,
	"ingress_request_volume_by_ingress": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])) by (ingress), 0.001)`,
	"ingress_request_network_sent": `sum(irate(nginx_ingress_controller_response_size_sum{$1,$2}[$3]))`,
	"ingress_request_network_received": `sum(irate(nginx_ingress_controller_request_size_sum{$1,$2}[$3]))`,
	"ingress_request_memory_bytes": `avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{$2})`,
	"ingress_request_cpu_usage": `avg(rate(nginx_ingress_controller_nginx_process_cpu_seconds_total{$2}[5m]))`,
	// workload
	"workload_cpu_usage": `round(namespace:workload_cpu_usage:sum{$1}, 0.001)`,
	"workload_memory_usage": `namespace:workload_memory_usage:sum{$1}`,
	"workload_memory_usage_wo_cache": `namespace:workload_memory_usage_wo_cache:sum{$1}`,
	"workload_net_bytes_transmitted": `namespace:workload_net_bytes_transmitted:sum_irate{$1}`,
	"workload_net_bytes_received": `namespace:workload_net_bytes_received:sum_irate{$1}`,
	"workload_deployment_replica": `label_join(sum (label_join(label_replace(kube_deployment_spec_replicas{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
	"workload_deployment_replica_available": `label_join(sum (label_join(label_replace(kube_deployment_status_replicas_available{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
	"workload_statefulset_replica": `label_join(sum (label_join(label_replace(kube_statefulset_replicas{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
	"workload_statefulset_replica_available": `label_join(sum (label_join(label_replace(kube_statefulset_status_replicas_current{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
	"workload_daemonset_replica": `label_join(sum (label_join(label_replace(kube_daemonset_status_desired_number_scheduled{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
	"workload_daemonset_replica_available": `label_join(sum (label_join(label_replace(kube_daemonset_status_number_available{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
	"workload_deployment_unavailable_replicas_ratio": `namespace:deployment_unavailable_replicas:ratio{$1}`,
	"workload_daemonset_unavailable_replicas_ratio": `namespace:daemonset_unavailable_replicas:ratio{$1}`,
	"workload_statefulset_unavailable_replicas_ratio": `namespace:statefulset_unavailable_replicas:ratio{$1}`,
	// pod
	"pod_cpu_usage": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`,
	"pod_memory_usage": `sum by (namespace, pod) (container_memory_usage_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
	"pod_memory_usage_wo_cache": `sum by (namespace, pod) (container_memory_working_set_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
	"pod_net_bytes_transmitted": `sum by (namespace, pod) (irate(container_network_transmit_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
	"pod_net_bytes_received": `sum by (namespace, pod) (irate(container_network_receive_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
	"pod_cpu_resource_limits": `sum by (namespace, pod) (kube_pod_container_resource_limits{origin_prometheus=~"",resource="cpu",unit="core"}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
	"pod_memory_resource_limits": `sum by (namespace, pod) (kube_pod_container_resource_limits{origin_prometheus=~"",resource="memory",unit="byte"}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
	// container
	"container_cpu_usage": `round(sum by (namespace, pod, container) (irate(container_cpu_usage_seconds_total{job="kubelet", container!="POD", container!="", image!="", $1}[5m])), 0.001)`,
	"container_memory_usage": `sum by (namespace, pod, container) (container_memory_usage_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`,
	"container_memory_usage_wo_cache": `sum by (namespace, pod, container) (container_memory_working_set_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`,
	"container_processes_usage": `sum by (namespace, pod, container) (container_processes{job="kubelet", container!="POD", container!="", image!="", $1})`,
	"container_threads_usage": `sum by (namespace, pod, container) (container_threads{job="kubelet", container!="POD", container!="", image!="", $1})`,
	// pvc
	"pvc_inodes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_free) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	"pvc_inodes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	"pvc_inodes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	"pvc_inodes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	"pvc_bytes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_available_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	"pvc_bytes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	"pvc_bytes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	"pvc_bytes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
	// component
	"etcd_server_list": `label_replace(up{job="etcd"}, "node_ip", "$1", "instance", "(.*):.*")`,
	"etcd_server_total": `count(up{job="etcd"})`,
	"etcd_server_up_total": `etcd:up:sum`,
	"etcd_server_has_leader": `label_replace(etcd_server_has_leader, "node_ip", "$1", "instance", "(.*):.*")`,
	"etcd_server_is_leader": `label_replace(etcd_server_is_leader, "node_ip", "$1", "instance", "(.*):.*")`,
	"etcd_server_leader_changes": `label_replace(etcd:etcd_server_leader_changes_seen:sum_changes, "node_ip", "$1", "node", "(.*)")`,
	"etcd_server_proposals_failed_rate": `avg(etcd:etcd_server_proposals_failed:sum_irate)`,
	"etcd_server_proposals_applied_rate": `avg(etcd:etcd_server_proposals_applied:sum_irate)`,
	"etcd_server_proposals_committed_rate": `avg(etcd:etcd_server_proposals_committed:sum_irate)`,
	"etcd_server_proposals_pending_count": `avg(etcd:etcd_server_proposals_pending:sum)`,
	"etcd_mvcc_db_size": `avg(etcd:etcd_mvcc_db_total_size:sum)`,
	"etcd_network_client_grpc_received_bytes": `sum(etcd:etcd_network_client_grpc_received_bytes:sum_irate)`,
	"etcd_network_client_grpc_sent_bytes": `sum(etcd:etcd_network_client_grpc_sent_bytes:sum_irate)`,
	"etcd_grpc_call_rate": `sum(etcd:grpc_server_started:sum_irate)`,
	"etcd_grpc_call_failed_rate": `sum(etcd:grpc_server_handled:sum_irate)`,
	"etcd_grpc_server_msg_received_rate": `sum(etcd:grpc_server_msg_received:sum_irate)`,
	"etcd_grpc_server_msg_sent_rate": `sum(etcd:grpc_server_msg_sent:sum_irate)`,
	"etcd_disk_wal_fsync_duration": `avg(etcd:etcd_disk_wal_fsync_duration:avg)`,
	"etcd_disk_wal_fsync_duration_quantile": `avg(etcd:etcd_disk_wal_fsync_duration:histogram_quantile) by (quantile)`,
	"etcd_disk_backend_commit_duration": `avg(etcd:etcd_disk_backend_commit_duration:avg)`,
	"etcd_disk_backend_commit_duration_quantile": `avg(etcd:etcd_disk_backend_commit_duration:histogram_quantile) by (quantile)`,
	"apiserver_up_sum": `apiserver:up:sum`,
	"apiserver_request_rate": `apiserver:apiserver_request_total:sum_irate`,
	"apiserver_request_by_verb_rate": `apiserver:apiserver_request_total:sum_verb_irate`,
	"apiserver_request_latencies": `apiserver:apiserver_request_duration:avg`,
	"apiserver_request_by_verb_latencies": `apiserver:apiserver_request_duration:avg_by_verb`,
	"scheduler_up_sum": `scheduler:up:sum`,
	"scheduler_schedule_attempts": `scheduler:scheduler_schedule_attempts:sum`,
	"scheduler_schedule_attempt_rate": `scheduler:scheduler_schedule_attempts:sum_rate`,
	"scheduler_e2e_scheduling_latency": `scheduler:scheduler_e2e_scheduling_duration:avg`,
	"scheduler_e2e_scheduling_latency_quantile": `scheduler:scheduler_e2e_scheduling_duration:histogram_quantile`,
}

// makeExpr looks up the PromQL template registered for the given metric and
// renders it for the monitoring level carried by opts, substituting the
// $1/$2/$3 placeholders with the appropriate label selectors.
func makeExpr(metric string, opts QueryOptions) string {
	tmpl := promQLTemplates[metric]
	switch opts.Level {
	case LevelCluster:
		return tmpl
	case LevelNode:
		return makeNodeMetricExpr(tmpl, opts)
	case LevelWorkspace:
		return makeWorkspaceMetricExpr(tmpl, opts)
	case LevelNamespace:
		return makeNamespaceMetricExpr(tmpl, opts)
	case LevelWorkload:
		return makeWorkloadMetricExpr(metric, tmpl, opts)
	case LevelPod:
		return makePodMetricExpr(tmpl, opts)
	case LevelContainer:
		return makeContainerMetricExpr(tmpl, opts)
	case LevelPVC:
		return makePVCMetricExpr(tmpl, opts)
	case LevelIngress:
		return makeIngressMetricExpr(tmpl, opts)
	case LevelComponent:
		return tmpl
	default:
		return tmpl
	}
}

// makeNodeMetricExpr builds the node label selector: an exact match when a
// node name is given, otherwise a regex match against the resource filter.
func makeNodeMetricExpr(tmpl string, o QueryOptions) string {
	var nodeSelector string
	if o.NodeName != "" {
		nodeSelector = fmt.Sprintf(`node="%s"`, o.NodeName)
	} else {
		nodeSelector = fmt.Sprintf(`node=~"%s"`, o.ResourceFilter)
	}
	return strings.Replace(tmpl, "$1", nodeSelector, -1)
}

// makeWorkspaceMetricExpr builds the workspace label selector: an exact match
// when a workspace name is given, otherwise a regex match against the
// resource filter, excluding the empty workspace.
func makeWorkspaceMetricExpr(tmpl string, o QueryOptions) string {
	var workspaceSelector string
	if o.WorkspaceName != "" {
		workspaceSelector = fmt.Sprintf(`workspace="%s"`, o.WorkspaceName)
	} else {
		workspaceSelector = fmt.Sprintf(`workspace=~"%s", workspace!=""`, o.ResourceFilter)
	}
	return strings.Replace(tmpl, "$1", workspaceSelector, -1)
}

func makeNamespaceMetricExpr(tmpl string, o QueryOptions) string {
	var namespaceSelector string

	// For monitoring namespaces in the specific workspace
	// GET /workspaces/{workspace}/namespaces
	if o.WorkspaceName != "" {
		namespaceSelector = fmt.Sprintf(`workspace="%s", namespace=~"%s"`, o.WorkspaceName, o.ResourceFilter)
		return strings.Replace(tmpl, "$1", namespaceSelector, -1)
	}

	// For monitoring the specific namespaces
	// GET /namespaces/{namespace} or
	// GET /namespaces
	if o.NamespaceName != "" {
		namespaceSelector = fmt.Sprintf(`namespace="%s"`, o.NamespaceName)
	} else {
		namespaceSelector = fmt.Sprintf(`namespace=~"%s"`, o.ResourceFilter)
	}
	return strings.Replace(tmpl, "$1", namespaceSelector, -1)
}

// makeWorkloadMetricExpr builds the workload selector ($1) and the
// kind-specific selector ($2) for workload-level metrics.
func makeWorkloadMetricExpr(metric, tmpl string, o QueryOptions) string {
	var kindSelector, workloadSelector string

	switch o.WorkloadKind {
	case "deployment":
		o.WorkloadKind = Deployment
	case "statefulset":
		o.WorkloadKind = StatefulSet
	case "daemonset":
		o.WorkloadKind = DaemonSet
	default:
		o.WorkloadKind = ".*"
	}
	workloadSelector = fmt.Sprintf(`namespace="%s", workload=~"%s:(%s)"`, o.NamespaceName, o.WorkloadKind, o.ResourceFilter)

	if strings.Contains(metric, "deployment") {
		kindSelector = fmt.Sprintf(`namespace="%s", deployment!="", deployment=~"%s"`, o.NamespaceName, o.ResourceFilter)
	}
	if strings.Contains(metric, "statefulset") {
		kindSelector = fmt.Sprintf(`namespace="%s", statefulset!="", statefulset=~"%s"`, o.NamespaceName, o.ResourceFilter)
	}
	if strings.Contains(metric, "daemonset") {
		kindSelector = fmt.Sprintf(`namespace="%s", daemonset!="", daemonset=~"%s"`, o.NamespaceName, o.ResourceFilter)
	}

	return strings.NewReplacer("$1", workloadSelector, "$2", kindSelector).Replace(tmpl)
}
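
// Example (hypothetical values): for the metric "workload_cpu_usage" with
// o.NamespaceName="default", o.WorkloadKind="deployment" and
// o.ResourceFilter="web.*", the "$1" placeholder expands to
//
//	namespace="default", workload=~"Deployment:(web.*)"
//
// so the rendered query matches only Deployment-owned workloads whose names
// match the filter.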

func makePodMetricExpr(tmpl string, o QueryOptions) string {
	var podSelector, workloadSelector string

	// For monitoring pods of the specific workload
	// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods
	if o.WorkloadName != "" {
		switch o.WorkloadKind {
		case "deployment":
			workloadSelector = fmt.Sprintf(`owner_kind="ReplicaSet", owner_name=~"^%s-[^-]{1,10}$"`, o.WorkloadName)
		case "statefulset":
			workloadSelector = fmt.Sprintf(`owner_kind="StatefulSet", owner_name="%s"`, o.WorkloadName)
		case "daemonset":
			workloadSelector = fmt.Sprintf(`owner_kind="DaemonSet", owner_name="%s"`, o.WorkloadName)
		}
	}

	// For monitoring pods in the specific namespace
	// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods or
	// GET /namespaces/{namespace}/pods/{pod} or
	// GET /namespaces/{namespace}/pods
	if o.NamespaceName != "" {
		if o.PodName != "" {
			podSelector = fmt.Sprintf(`pod="%s", namespace="%s"`, o.PodName, o.NamespaceName)
		} else {
			podSelector = fmt.Sprintf(`pod=~"%s", namespace="%s"`, o.ResourceFilter, o.NamespaceName)
		}
	} else {
		var namespaces, pods []string
		if o.NamespacedResourcesFilter != "" {
			for _, np := range strings.Split(o.NamespacedResourcesFilter, "|") {
				if nparr := strings.SplitN(np, "/", 2); len(nparr) > 1 {
					namespaces = append(namespaces, nparr[0])
					pods = append(pods, nparr[1])
				} else {
					pods = append(pods, np)
				}
			}
		}

		// For monitoring pods on the specific node
		// GET /nodes/{node}/pods/{pod}
		// GET /nodes/{node}/pods
		if o.NodeName != "" {
			if o.PodName != "" {
				if nparr := strings.SplitN(o.PodName, "/", 2); len(nparr) > 1 {
					podSelector = fmt.Sprintf(`namespace="%s",pod="%s", node="%s"`, nparr[0], nparr[1], o.NodeName)
				} else {
					podSelector = fmt.Sprintf(`pod="%s", node="%s"`, o.PodName, o.NodeName)
				}
			} else {
				var ps []string
				ps = append(ps, fmt.Sprintf(`node="%s"`, o.NodeName))
				if o.ResourceFilter != "" {
					ps = append(ps, fmt.Sprintf(`pod=~"%s"`, o.ResourceFilter))
				}
				if len(namespaces) > 0 {
					ps = append(ps, fmt.Sprintf(`namespace=~"%s"`, strings.Join(namespaces, "|")))
				}
				if len(pods) > 0 {
					ps = append(ps, fmt.Sprintf(`pod=~"%s"`, strings.Join(pods, "|")))
				}
				podSelector = strings.Join(ps, ",")
			}
		} else {
			// For monitoring pods in the whole cluster
			// GET /pods
			var ps []string
			if len(namespaces) > 0 {
				ps = append(ps, fmt.Sprintf(`namespace=~"%s"`, strings.Join(namespaces, "|")))
			}
			if len(pods) > 0 {
				ps = append(ps, fmt.Sprintf(`pod=~"%s"`, strings.Join(pods, "|")))
			}
			if len(ps) > 0 {
				podSelector = strings.Join(ps, ",")
			}
		}
	}

	return strings.NewReplacer("$1", workloadSelector, "$2", podSelector).Replace(tmpl)
}
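
// Example (hypothetical values): with neither NodeName nor NamespaceName set
// and o.NamespacedResourcesFilter = "kube-system/coredns|default/nginx",
// the filter splits into namespaces ["kube-system", "default"] and pods
// ["coredns", "nginx"], yielding the cluster-wide pod selector
//
//	namespace=~"kube-system|default",pod=~"coredns|nginx"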

// makeContainerMetricExpr builds the container selector, scoped to the pod
// and namespace carried by the query options.
func makeContainerMetricExpr(tmpl string, o QueryOptions) string {
	var containerSelector string
	if o.ContainerName != "" {
		containerSelector = fmt.Sprintf(`pod="%s", namespace="%s", container="%s"`, o.PodName, o.NamespaceName, o.ContainerName)
	} else {
		containerSelector = fmt.Sprintf(`pod="%s", namespace="%s", container=~"%s"`, o.PodName, o.NamespaceName, o.ResourceFilter)
	}
	return strings.Replace(tmpl, "$1", containerSelector, -1)
}

func makePVCMetricExpr(tmpl string, o QueryOptions) string {
	var pvcSelector string

	// For monitoring persistentvolumeclaims in the specific namespace
	// GET /namespaces/{namespace}/persistentvolumeclaims/{persistentvolumeclaim} or
	// GET /namespaces/{namespace}/persistentvolumeclaims
	if o.NamespaceName != "" {
		if o.PersistentVolumeClaimName != "" {
			pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim="%s"`, o.NamespaceName, o.PersistentVolumeClaimName)
		} else {
			pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim=~"%s"`, o.NamespaceName, o.ResourceFilter)
		}
		return strings.Replace(tmpl, "$1", pvcSelector, -1)
	}

	// For monitoring persistentvolumeclaims of the specific storageclass
	// GET /storageclasses/{storageclass}/persistentvolumeclaims
	if o.StorageClassName != "" {
		pvcSelector = fmt.Sprintf(`storageclass="%s", persistentvolumeclaim=~"%s"`, o.StorageClassName, o.ResourceFilter)
	}
	return strings.Replace(tmpl, "$1", pvcSelector, -1)
}

func makeIngressMetricExpr(tmpl string, o QueryOptions) string {
	var ingressSelector string
	var jobSelector string
	duration := "5m"

	// Parse the range vector selector duration: metric{key=value}[duration]
	if o.Duration != nil {
		duration = o.Duration.String()
	}

	// job is a required filter
	// GET /namespaces/{namespace}/ingress?job=xxx&pod=xxx
	if o.Job != "" {
		jobSelector = fmt.Sprintf(`job="%s"`, o.Job)
		if o.PodName != "" {
			jobSelector = fmt.Sprintf(`%s,controller_pod="%s"`, jobSelector, o.PodName)
		}
	}

	// ingressSelector is never assigned here, so "$1" is replaced with the
	// empty string.
	tmpl = strings.Replace(tmpl, "$1", ingressSelector, -1)
	tmpl = strings.Replace(tmpl, "$2", jobSelector, -1)
	return strings.Replace(tmpl, "$3", duration, -1)
}
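
Taken together, a minimal usage sketch (hypothetical call site; QueryOptions and the Level* constants are defined elsewhere in the tracker package) shows how a template's placeholder is expanded:

	// Hypothetical example: render the node-level CPU utilisation query
	// for a single node. With NodeName set, makeNodeMetricExpr substitutes
	// an exact-match node selector for "$1".
	expr := makeExpr("node_cpu_utilisation", QueryOptions{
		Level:    LevelNode,
		NodeName: "node-1",
	})
	// expr == `node:node_cpu_utilisation:avg1m{node="node-1"}`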

PCM is positioned as a software stack over cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.