Files
k3s-cluster/monitoring/grafana-dashboard-pods.yaml
Roger Oriol bf1387dc3e monitoring: add Grafana dashboards + kube-state-metrics & node-exporter
Dashboards (provisioned via ConfigMaps into Grafana pod, 'K3s Cluster' folder):
- Cluster Overview: per-namespace CPU/mem/net/fs, pod counts, pod health (KSM)
- Pods & Services: per-pod CPU/mem/net/fs, throttling, pod status, restarts, PVCs
- Nodes: per-node CPU%/mem%, load average, disk usage, network (node-exporter)
- Control Plane & API Server: request rate, latency p95, 5xx, kubelet/PLEG
- Prometheus Self-Monitoring: ingestion, series, scrape duration, memory

Exporters (auto-scraped via existing kubernetes-service-endpoints job):
- kube-state-metrics: pod/deployment/PVC/replica state (kube_pod_status_phase,
  kube_pod_container_status_restarts_total, kube_persistentvolumeclaim_*)
- node-exporter (DaemonSet, hostNetwork): node_cpu_seconds_total,
  node_memory_*, node_filesystem_*, node_load*, node_network_*
2026-06-26 19:48:17 +02:00

313 lines
15 KiB
YAML

# ConfigMap that carries the "Pods & Services" Grafana dashboard as a JSON document.
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-pods
namespace: monitoring
labels:
app: grafana
# NOTE(review): presumably the label the dashboard provisioner selects on — confirm against the Grafana deployment.
grafana_dashboard: "1"
data:
# Literal block scalar: everything after this key is the dashboard JSON itself,
# so YAML comments must not be added inside it (they would become part of the JSON text).
pods.json: |
{
"annotations": {"list": []},
"editable": true,
"graphTooltip": 1,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
  "id": 1,
  "title": "CPU Usage per Pod",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 24, "x": 0, "y": 0},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum(rate(container_cpu_usage_seconds_total{container!=\"\",container!=\"POD\",namespace=~\"$namespace\"}[5m])) by (pod)",
      "legendFormat": "{{pod}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "core",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "normal"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 2,
  "title": "Memory Usage per Pod",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 24, "x": 0, "y": 9},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum(container_memory_working_set_bytes{container!=\"\",container!=\"POD\",namespace=~\"$namespace\"}) by (pod)",
      "legendFormat": "{{pod}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "bytes",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "normal"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 3,
  "title": "Network RX per Pod",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 12, "x": 0, "y": 18},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[5m])) by (pod)",
      "legendFormat": "RX {{pod}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "Bps",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "none"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 4,
  "title": "Network TX per Pod",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 12, "x": 12, "y": 18},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\"}[5m])) by (pod)",
      "legendFormat": "TX {{pod}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "Bps",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "none"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 5,
  "title": "Filesystem Usage per Pod",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 12, "x": 0, "y": 27},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum(container_fs_usage_bytes{namespace=~\"$namespace\"}) by (pod)",
      "legendFormat": "{{pod}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "bytes",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "none"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 6,
  "title": "CPU Throttling % per Pod",
  "description": "Percentage of CFS periods in which the pod was throttled over the last 5m: throttled periods / total periods * 100.",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 12, "x": 12, "y": 27},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum(rate(container_cpu_cfs_throttled_periods_total{namespace=~\"$namespace\"}[5m])) by (pod) / sum(rate(container_cpu_cfs_periods_total{namespace=~\"$namespace\"}[5m])) by (pod) * 100",
      "legendFormat": "{{pod}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "none"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 7,
  "title": "Pod Resource Summary (live)",
  "description": "Instant snapshot per namespace/pod: working-set memory (query A), 5m CPU usage rate (query B) and 5m network receive rate (query C), merged into one row per pod.",
  "type": "table",
  "pluginVersion": "10.2.3",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 10, "w": 24, "x": 0, "y": 36},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum by (namespace, pod) (container_memory_working_set_bytes{container!=\"\",container!=\"POD\",namespace=~\"$namespace\"})",
      "format": "table",
      "instant": true
    },
    {
      "refId": "B",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum by (namespace, pod) (rate(container_cpu_usage_seconds_total{container!=\"\",container!=\"POD\",namespace=~\"$namespace\"}[5m]))",
      "format": "table",
      "instant": true
    },
    {
      "refId": "C",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum by (namespace, pod) (rate(container_network_receive_bytes_total{namespace=~\"$namespace\"}[5m]))",
      "format": "table",
      "instant": true
    }
  ],
  "transformations": [
    {"id": "merge", "options": {}},
    {
      "id": "groupBy",
      "options": {
        "fields": {
          "namespace": {"aggregations": [], "operation": "groupby"},
          "pod": {"aggregations": [], "operation": "groupby"},
          "Value #A": {"aggregations": ["lastNotNull"], "operation": "aggregate"},
          "Value #B": {"aggregations": ["lastNotNull"], "operation": "aggregate"},
          "Value #C": {"aggregations": ["lastNotNull"], "operation": "aggregate"}
        }
      }
    },
    {
      "id": "organize",
      "options": {
        "excludeByName": {"Time": true},
        "renameByName": {
          "Value #A (lastNotNull)": "Memory (bytes)",
          "Value #B (lastNotNull)": "CPU (cores)",
          "Value #C (lastNotNull)": "Network RX (Bps)"
        }
      }
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "color": {"mode": "thresholds"},
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 1},
          {"color": "red", "value": 5}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "showHeader": true,
    "cellHeight": "sm",
    "footer": {
      "show": false,
      "reducer": ["sum"],
      "countRows": false,
      "fields": ""
    }
  }
},
{
  "id": 8,
  "title": "Pod Status by Namespace (KSM)",
  "description": "Number of pods per namespace in each of the Running, Pending and Failed phases, one series per namespace/phase pair (kube_pod_status_phase).",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 12, "x": 0, "y": 46},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum by (namespace, phase) (kube_pod_status_phase{phase=~\"Running|Pending|Failed\",namespace=~\"$namespace\"})",
      "legendFormat": "{{namespace}} {{phase}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "none"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 9,
  "title": "Container Restarts (last 1h)",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 12, "x": 12, "y": 46},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "sum by (namespace) (increase(kube_pod_container_status_restarts_total{namespace=~\"$namespace\"}[1h]))",
      "legendFormat": "{{namespace}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "none"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
},
{
  "id": 10,
  "title": "PVC Storage Requests by Claim (KSM)",
  "type": "timeseries",
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "gridPos": {"h": 9, "w": 24, "x": 0, "y": 55},
  "targets": [
    {
      "refId": "A",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "expr": "kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace=~\"$namespace\"}",
      "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "bytes",
      "color": {"mode": "palette-classic"},
      "custom": {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "fillOpacity": 10,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "none"},
        "thresholdsStyle": {"mode": "off"}
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "options": {
    "legend": {
      "calcs": ["lastNotNull"],
      "displayMode": "table",
      "placement": "right",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  }
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["k3s", "pods"],
"templating": {
  "list": [
    {
      "name": "namespace",
      "type": "query",
      "datasource": {"type": "prometheus", "uid": "Prometheus"},
      "definition": "label_values(container_cpu_usage_seconds_total, namespace)",
      "query": "label_values(container_cpu_usage_seconds_total, namespace)",
      "regex": "",
      "refresh": 2,
      "sort": 1,
      "multi": true,
      "includeAll": true,
      "allValue": ".*",
      "current": {
        "selected": true,
        "text": "All",
        "value": "$__all"
      },
      "options": [],
      "hide": 0,
      "skipUrlSync": false
    }
  ]
},
"time": {"from": "now-6h", "to": "now"},
"timepicker": {},
"timezone": "",
"title": "Pods & Services",
"uid": "k3s-pods",
"version": 2,
"weekStart": ""
}