monitoring: add Grafana dashboards + kube-state-metrics & node-exporter
Dashboards (provisioned via ConfigMaps into Grafana pod, 'K3s Cluster' folder): - Cluster Overview: per-namespace CPU/mem/net/fs, pod counts, pod health (KSM) - Pods & Services: per-pod CPU/mem/net/fs, throttling, pod status, restarts, PVCs - Nodes: per-node CPU%/mem%, load average, disk usage, network (node-exporter) - Control Plane & API Server: request rate, latency p95, 5xx, kubelet/PLEG - Prometheus Self-Monitoring: ingestion, series, scrape duration, memory Exporters (auto-scraped via existing kubernetes-service-endpoints job): - kube-state-metrics: pod/deployment/PVC/replica state (kube_pod_status_phase, kube_pod_container_status_restarts_total, kube_persistentvolumeclaim_*) - node-exporter (DaemonSet, hostNetwork): node_cpu_seconds_total, node_memory_*, node_filesystem_*, node_load*, node_network_*
This commit is contained in:
118
monitoring/kube-state-metrics.yaml
Normal file
118
monitoring/kube-state-metrics.yaml
Normal file
@@ -0,0 +1,118 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- configmaps
|
||||
- secrets
|
||||
- nodes
|
||||
- pods
|
||||
- services
|
||||
- resourcequotas
|
||||
- replicationcontrollers
|
||||
- limitranges
|
||||
- persistentvolumeclaims
|
||||
- persistentvolumes
|
||||
- namespaces
|
||||
- endpoints
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["statefulsets", "daemonsets", "deployments", "replicasets"]
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["cronjobs", "jobs"]
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["autoscaling"]
|
||||
resources: ["horizontalpodautoscalers"]
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["networking.k8s.io"]
|
||||
resources: ["ingresses"]
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["storage.k8s.io"]
|
||||
resources: ["storageclasses", "volumeattachments"]
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["certificates.k8s.io"]
|
||||
resources: ["certificatesigningrequests"]
|
||||
verbs: ["list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: kube-state-metrics
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: kube-state-metrics
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
spec:
|
||||
serviceAccountName: kube-state-metrics
|
||||
containers:
|
||||
- name: kube-state-metrics
|
||||
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.10.1
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http-metrics
|
||||
- containerPort: 8081
|
||||
name: telemetry
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 8081
|
||||
initialDelaySeconds: 5
|
||||
timeoutSeconds: 5
|
||||
resources:
|
||||
requests:
|
||||
memory: "128Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8080"
|
||||
spec:
|
||||
selector:
|
||||
app: kube-state-metrics
|
||||
ports:
|
||||
- name: http-metrics
|
||||
port: 8080
|
||||
targetPort: http-metrics
|
||||
- name: telemetry
|
||||
port: 8081
|
||||
targetPort: telemetry
|
||||
Reference in New Issue
Block a user