Dashboards (provisioned via ConfigMaps into Grafana pod, 'K3s Cluster' folder): - Cluster Overview: per-namespace CPU/mem/net/fs, pod counts, pod health (KSM) - Pods & Services: per-pod CPU/mem/net/fs, throttling, pod status, restarts, PVCs - Nodes: per-node CPU%/mem%, load average, disk usage, network (node-exporter) - Control Plane & API Server: request rate, latency p95, 5xx, kubelet/PLEG - Prometheus Self-Monitoring: ingestion, series, scrape duration, memory Exporters (auto-scraped via existing kubernetes-service-endpoints job): - kube-state-metrics: pod/deployment/PVC/replica state (kube_pod_status_phase, kube_pod_container_status_restarts_total, kube_persistentvolumeclaim_*) - node-exporter (DaemonSet, hostNetwork): node_cpu_seconds_total, node_memory_*, node_filesystem_*, node_load*, node_network_*
332 lines
14 KiB
YAML
332 lines
14 KiB
YAML
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-cluster-overview
|
|
namespace: monitoring
|
|
labels:
|
|
app: grafana
|
|
grafana_dashboard: "1"
|
|
data:
|
|
cluster-overview.json: |
|
|
{
|
|
"annotations": {"list": []},
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 1,
|
|
"id": null,
|
|
"links": [],
|
|
"liveNow": false,
|
|
"panels": [
|
|
{
|
|
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "thresholds"},
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": null}
|
|
]
|
|
},
|
|
"unit": "s"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"gridPos": {"h": 5, "w": 4, "x": 0, "y": 0},
|
|
"id": 1,
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "area",
|
|
"justifyMode": "auto",
|
|
"orientation": "auto",
|
|
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
|
"textMode": "auto"
|
|
},
|
|
"pluginVersion": "10.2.3",
|
|
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "time() - max(process_start_time_seconds{job=\"prometheus\"})", "refId": "A"}],
|
|
"title": "Prometheus Uptime",
|
|
"type": "stat"
|
|
},
|
|
{
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "red", "value": null},
          {"color": "green", "value": 1}
        ]
      }
    },
    "overrides": []
  },
  "gridPos": {"h": 5, "w": 4, "x": 4, "y": 0},
  "id": 2,
  "options": {
    "colorMode": "background",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "horizontal",
    "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
    "textMode": "value_and_name"
  },
  "pluginVersion": "10.2.3",
  "targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(kubelet_running_pods)", "refId": "A"}],
  "title": "Running Pods (total)",
  "type": "stat"
},
|
|
{
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "gridPos": {"h": 5, "w": 4, "x": 8, "y": 0},
  "id": 3,
  "options": {
    "colorMode": "background",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "horizontal",
    "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
    "textMode": "value_and_name"
  },
  "pluginVersion": "10.2.3",
  "targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(kubelet_running_containers{container_state=\"running\"})", "refId": "A"}],
  "title": "Running Containers",
  "type": "stat"
},
|
|
{
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "mappings": [
        {"options": {"0": {"text": "Down", "color": "red"}, "1": {"text": "Up", "color": "green"}}, "type": "value"}
      ],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "red", "value": null},
          {"color": "green", "value": 1}
        ]
      }
    },
    "overrides": []
  },
  "gridPos": {"h": 5, "w": 12, "x": 12, "y": 0},
  "id": 4,
  "options": {
    "colorMode": "background",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "horizontal",
    "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
    "textMode": "value_and_name"
  },
  "pluginVersion": "10.2.3",
  "targets": [
    {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "up{job=\"kubernetes-apiservers\"}", "legendFormat": "{{job}} / {{instance}}", "refId": "A"},
    {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "up{job=\"kubernetes-nodes\"}", "legendFormat": "{{job}} / {{instance}}", "refId": "B"}
  ],
  "title": "API Server & Kubelet Scrape Targets",
  "type": "stat"
},
|
|
{
|
|
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {
|
|
"axisCenteredZero": false,
|
|
"axisColorMode": "text",
|
|
"axisLabel": "",
|
|
"axisPlacement": "auto",
|
|
"barAlignment": 0,
|
|
"drawStyle": "line",
|
|
"fillOpacity": 10,
|
|
"gradientMode": "none",
|
|
"hideFrom": {"legend": false, "tooltip": false, "viz": false},
|
|
"insertNulls": false,
|
|
"lineInterpolation": "linear",
|
|
"lineWidth": 1,
|
|
"pointSize": 5,
|
|
"scaleDistribution": {"type": "linear"},
|
|
"showPoints": "never",
|
|
"spanNulls": true,
|
|
"stacking": {"group": "A", "mode": "none"},
|
|
"thresholdsStyle": {"mode": "off"}
|
|
},
|
|
"mappings": [],
|
|
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
|
"unit": "bytes"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 5},
|
|
"id": 10,
|
|
"options": {
|
|
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
|
},
|
|
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(container_memory_working_set_bytes{container!=\"\",container!=\"POD\"}) by (namespace)", "legendFormat": "{{namespace}}", "refId": "A"}],
|
|
"title": "Memory Usage by Namespace",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {
|
|
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
|
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
|
},
|
|
"mappings": [],
|
|
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
|
"unit": "core"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 5},
|
|
"id": 11,
|
|
"options": {
|
|
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
|
},
|
|
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(rate(container_cpu_usage_seconds_total{container!=\"\",container!=\"POD\"}[5m])) by (namespace)", "legendFormat": "{{namespace}}", "refId": "A"}],
|
|
"title": "CPU Usage by Namespace",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {
|
|
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
|
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
|
},
|
|
"mappings": [],
|
|
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
|
"unit": "Bps"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 14},
|
|
"id": 12,
|
|
"options": {
|
|
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
|
},
|
|
"targets": [
|
|
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(rate(container_network_receive_bytes_total[5m])) by (namespace)", "legendFormat": "RX {{namespace}}", "refId": "A"},
|
|
{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(rate(container_network_transmit_bytes_total[5m])) by (namespace)", "legendFormat": "TX {{namespace}}", "refId": "B"}
|
|
],
|
|
"title": "Network RX/TX by Namespace",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "palette-classic"},
|
|
"custom": {
|
|
"drawStyle": "line", "fillOpacity": 10, "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "never", "spanNulls": true,
|
|
"stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "off"}
|
|
},
|
|
"mappings": [],
|
|
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
|
"unit": "decbytes"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 14},
|
|
"id": 13,
|
|
"options": {
|
|
"legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true},
|
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
|
},
|
|
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(container_fs_usage_bytes) by (instance)", "legendFormat": "{{instance}}", "refId": "A"}],
|
|
"title": "Filesystem Usage by Node",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {"type": "prometheus", "uid": "Prometheus"},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "thresholds"},
|
|
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
|
|
"unit": "short"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"gridPos": {"h": 9, "w": 24, "x": 0, "y": 23},
|
|
"id": 20,
|
|
"options": {
|
|
"showHeader": true,
|
|
"cellHeight": "sm",
|
|
"footer": {"show": false, "reducer": ["sum"], "countRows": false, "fields": ""}
|
|
},
|
|
"pluginVersion": "10.2.3",
|
|
"targets": [{"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sort_desc(sum(container_memory_working_set_bytes{container!=\"\",container!=\"POD\"}) by (namespace,pod))", "format": "table", "instant": true, "refId": "A"}],
|
|
"title": "Pods by Memory (live)",
|
|
"type": "table",
|
|
"transformations": [
|
|
{"id": "organize", "options": {"excludeByName": {"Time": true}, "renameByName": {"Value": "Memory (bytes)"}}}
|
|
]
|
|
},
|
|
{
  "datasource": {"type": "prometheus", "uid": "Prometheus"},
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]},
      "unit": "short"
    },
    "overrides": []
  },
  "gridPos": {"h": 9, "w": 24, "x": 0, "y": 32},
  "id": 30,
  "options": {
    "showHeader": true,
    "cellHeight": "sm",
    "footer": {"show": false, "reducer": ["sum"], "countRows": false, "fields": ""}
  },
  "pluginVersion": "10.2.3",
  "targets": [
    {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) by (namespace)", "format": "table", "instant": true, "refId": "A"},
    {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(kube_pod_status_phase{phase=\"Pending\"}) by (namespace)", "format": "table", "instant": true, "refId": "B"},
    {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace)", "format": "table", "instant": true, "refId": "C"},
    {"datasource": {"type": "prometheus", "uid": "Prometheus"}, "expr": "sum(increase(kube_pod_container_status_restarts_total[1h])) by (namespace)", "format": "table", "instant": true, "refId": "D"}
  ],
  "title": "Pod Health by Namespace (KSM)",
  "type": "table",
  "transformations": [
    {"id": "merge", "options": {}},
    {"id": "groupBy", "options": {"fields": {"Value #A": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #B": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #C": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "Value #D": {"aggregations": ["lastNotNull"], "operation": "aggregate"}, "namespace": {"aggregations": [], "operation": "groupby"}}}},
    {"id": "organize", "options": {"excludeByName": {"Time": true}, "renameByName": {"Value #A": "Running", "Value #B": "Pending", "Value #C": "Failed", "Value #D": "Restarts (1h)"}}}
  ]
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 38,
|
|
"style": "dark",
|
|
"tags": ["k3s", "overview"],
|
|
"templating": {"list": []},
|
|
"time": {"from": "now-6h", "to": "now"},
|
|
"timepicker": {},
|
|
"timezone": "",
|
|
"title": "Cluster Overview",
|
|
"uid": "k3s-cluster-overview",
|
|
"version": 2,
|
|
"weekStart": ""
|
|
}
|