diff --git a/argocd/apps/platform-engineer.yaml b/argocd/apps/platform-engineer.yaml new file mode 100644 index 0000000..f57c31a --- /dev/null +++ b/argocd/apps/platform-engineer.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: platform-engineer + namespace: argocd + annotations: + argocd.argoproj.io/sync-wave: "0" +spec: + project: k3s-cluster + source: + repoURL: https://git.rogi.casa/roger/k3s-cluster.git + targetRevision: main + path: platform-engineer + directory: + recurse: true + destination: + server: https://kubernetes.default.svc + namespace: platform-engineer + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false diff --git a/argocd/gen-apps.sh b/argocd/gen-apps.sh index b5d7e63..31f9b13 100755 --- a/argocd/gen-apps.sh +++ b/argocd/gen-apps.sh @@ -38,6 +38,7 @@ APPS=( "openwebui|openwebui|openwebui|true|true" "phoenix|phoenix|phoenix|true|false" "pihole|pihole|pihole|true|true" + "platform-engineer|platform-engineer|platform-engineer|true|true" "qbittorrent|qbittorrent|qbittorrent|true|true" "vaultwarden|vaultwarden|vaultwarden|true|true" ) diff --git a/litellm/litellm.yaml b/litellm/litellm.yaml index a814a29..eee4a94 100644 --- a/litellm/litellm.yaml +++ b/litellm/litellm.yaml @@ -26,7 +26,12 @@ data: - model_name: glm-4.7-flash litellm_params: model: ollama/glm-4.7-flash - api_base: http://10.88.88.235:11434 + api_base: http://10.88.20.12:11434 + # Used by the platform-engineer Hermes agent (deployed in ns platform-engineer). 
+ - model_name: qwen-3.6:27b + litellm_params: + model: ollama/qwen3.6:27b + api_base: http://10.88.20.12:11434 litellm_settings: #set_verbose: True # Uncomment this if you want to see verbose logs; not recommended in production callbacks: ["arize_phoenix"] diff --git a/platform-engineer/README.md b/platform-engineer/README.md new file mode 100644 index 0000000..96a7ebc --- /dev/null +++ b/platform-engineer/README.md @@ -0,0 +1,367 @@ +# Platform Engineer Agent — Deployment Plan + +An autonomous **Hermes Agent** that runs inside the k3s cluster, watches its +health on a schedule, tries to fix simple problems, and notifies me (via +Discord) when something needs my attention or a fix failed. + +Docs: https://hermes-agent.nousresearch.com/docs/user-guide/docker + +--- + +## 1. Goal & operating model + +- **One Hermes container** in a new namespace `platform-engineer`, scheduled on + the powerful amd64 node (`roger-nucbox-evo-x2`, 24 GiB RAM). +- Hermes runs in **gateway mode** under s6 supervision (`command: gateway run`), + so the built-in **cron scheduler** is active and survives restarts. +- The agent talks to the cluster with `kubectl` from *inside* the container + (terminal backend = `local`). We give the pod a **ServiceAccount + ClusterRole** + scoped to read-mostly + restart/scale/delete-pod permissions. +- LLM calls are routed through the in-cluster **LiteLLM** proxy + (`litellm.rogi.casa`) — no external API keys needed in the cluster. +- Notifications go to **Discord** (reuse the pattern from `myorg-assistant`). +- A set of **cron jobs** (Hermes-native, not Kubernetes CronJobs) make the agent + run periodic checks. Watchdog checks use `[SILENT]` so it only pings me when + something is wrong. 
+ +Why Hermes-native cron (not k8s CronJobs): +- Hermes cron ticks inside the gateway, runs in an isolated agent session, + supports `[SILENT]` suppression, `deliver="discord"`, `workdir`, and + `context_from` chaining — far less plumbing than spawning a fresh pod per run. +- Cron jobs live in `~/.hermes/cron/jobs.json` on the PVC, so they survive pod + restarts and can be edited live via `hermes cron edit` without redeploying. + +--- + +## 2. Files to create (this directory) + +``` +platform-engineer/ +├── namespace.yaml # namespace platform-engineer +├── rbac.yaml # ServiceAccount + ClusterRole (+binding) +├── configmap.yaml # hermes config.yaml + SOUL.md + cron seed script +├── secret.yaml # DISCORD bot token, LITELLM_API_KEY, kubeconfig-less SA token +├── pvc.yaml # persistent /opt/data (HERMES_HOME) +├── dockerfile # derived image: hermes-agent + kubectl + helm +├── deployment.yaml # Deployment, schedules on amd64, mounts kube SA token +├── ingress.yaml # hermes.rogi.casa → dashboard (optional) +└── README.md # this file +``` + +Then add a line to `argocd/gen-apps.sh` `APPS=(...)`: +``` +"platform-engineer|platform-engineer|platform-engineer|true|true" +``` +and re-run `./argocd/gen-apps.sh` to generate `argocd/apps/platform-engineer.yaml` +so ArgoCD reconciles it like every other app in the repo. + +--- + +## 3. RBAC — least privilege + +ServiceAccount `platform-engineer` in ns `platform-engineer`, bound to a +**ClusterRole** scoped to *platform engineer* actions: + +**Read (get/list/watch):** nodes, pods, services, deployments, statefulsets, +daemonsets, replicasets, jobs, cronjobs, events, configmaps, secrets, PVCs, +ingresses, namespaces. 
+ +**Act (patch/update on an allowlist):** +- `pods` → `delete` (force-restart a stuck pod), `patch` (`/evict`, annotations) +- `deployments`, `statefulsets`, `daemonsets`, `replicasets` → `patch` (restart + via `kubectl rollout restart` / scale), `update` +- `jobs`, `cronjobs` → `delete`, `patch` +- `pods/exec` (subresource) → `create` (only if we want the agent to `kubectl + exec` into pods for log-style debugging — optional; keep off initially) +- `events` → `get/list/watch` only + +**No cluster-scoped writes** (no creating namespaces, no node taints, no RBAC +edits, no CRDs). The agent can *propose* those and tell me; it cannot do them +itself. All mutating calls show up in the Kubernetes audit logs; verify the grants with +`kubectl auth can-i --list --as=system:serviceaccount:platform-engineer:platform-engineer`. + +The pod uses the k3s in-cluster ServiceAccount token (`/var/run/secrets/... +/serviceaccount/token`) + the `KUBERNETES_SERVICE_HOST/PORT` env vars k3s already +injects — **no kubeconfig file, no long-lived token on disk**. + +--- + +## 4. Image — thin derived Dockerfile + +```dockerfile +FROM nousresearch/hermes-agent:latest +USER root +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl gnupg \ + && curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.35/deb/Release.key \ + | gpg --dearmor -o /usr/share/keyrings/kubernetes-apt-keyring.gpg \ + && echo 'deb [signed-by=/usr/share/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.35/deb/ /' \ + > /etc/apt/sources.list.d/kubernetes.list \ + && apt-get update \ + && apt-get install -y --no-install-recommends kubectl \ + && curl -fsSL https://get.helm.sh/helm-v3.16.3-linux-amd64.tar.gz \ + | tar -xz -C /usr/local/bin --strip-components=1 linux-amd64/helm \ + && rm -rf /var/lib/apt/lists/* +USER hermes +``` + +> Note: the cluster is mixed arch (arm64/amd64/arm). The agent pod is pinned to +> the amd64 node, so `linux-amd64` helm + `kubectl` packages are fine. 
If you +> later want it portable, switch to a multi-arch build with +> `TARGETARCH` and install matching helm arch. + +Build & push to your Gitea registry (`git.rogi.casa/roger/...`) — same +`imagePullSecrets: gitea-registry` pattern as `gym-tracker`. Tag with the +hermes version + a short git sha. + +--- + +## 5. Hermes configuration (mounted via ConfigMap → /opt/data/config.yaml) + +```yaml +# config.yaml (seeded into the PVC on first boot) +model: + provider: openai-api + default: claude-4.5-haiku + base_url: "https://litellm.rogi.casa/v1" + api_mode: chat_completions + +# Use a cheap, fast model for auxiliary tasks (titling, compression) +auxiliary: + compression: + provider: openai-api + model: gemini-3-flash + title_generation: + provider: openai-api + model: gemini-3-flash + +terminal: + backend: local + cwd: /workspace # a working dir for any kubectl output / scratch + timeout: 180 + home_mode: profile # isolate tool credentials under HERMES_HOME/home + +# Unattended gateway → circuit-breaker on tool-call loops +tool_loop_guardrails: + hard_stop_enabled: true + hard_stop_after: + exact_failure: 5 + idempotent_no_progress: 5 + +sessions: + auto_prune: true + retention_days: 90 + +cron: + wrap_response: false # cleaner Discord messages + +memory: + memory_enabled: true + user_profile_enabled: true +``` + +`.env` (from Secret, mounted to `/opt/data/.env`): +``` +OPENAI_API_KEY= +OPENAI_BASE_URL=https://litellm.rogi.casa/v1 +DISCORD_BOT_TOKEN= +DISCORD_HOME_CHANNEL= +# Dashboard auth (homelab, trusted LAN behind ingress) +HERMES_DASHBOARD_BASIC_AUTH_USERNAME=roger +HERMES_DASHBOARD_BASIC_AUTH_PASSWORD= +``` + +> Why `OPENAI_API_KEY` + `OPENAI_BASE_URL`: the `openai-api` provider honours +> `OPENAI_BASE_URL`, so this is the simplest way to point Hermes at the +> in-cluster LiteLLM. `claude-4.5-haiku` / `gemini-3-flash` are the model names +> already exposed by your `litellm/litellm.yaml` ConfigMap. + +`SOUL.md` (personality + guardrails) — see `configmap.yaml`. 
Key points: +- Identity: "Platform Engineer for the rogi.casa k3s cluster." +- Knows the cluster layout (3 nodes, ArgoCD GitOps, Traefik+cert-manager, + LiteLLM, services list). +- Operating rules: read-first; only act on the allowlisted verbs; never edit + RBAC / taints / namespaces / CRDs; when in doubt, notify instead of acting; + always cite the resource and the command used. +- How to reach me: `deliver="discord"`. + +--- + +## 6. Deployment + +- `replicas: 1` (Hermes data dir is single-writer — never scale >1). +- `nodeSelector: kubernetes.io/arch: amd64` + preferred `hardware: high-memory` + affinity → lands on the NUC. +- `resources`: requests 512Mi/250m, limits 2Gi/1 core (Hermes recommends + 2–4 GiB; 1 GiB is fine without browser tools, which we keep off). +- Volume: PVC mounted at `/opt/data` (HERMES_HOME), RWX not needed (single pod). +- Ports: 8642 (gateway API, internal only) and 9119 (dashboard) → exposed via + Ingress `hermes.rogi.casa` with TLS + basic-auth (already enforced by the + `HERMES_DASHBOARD_BASIC_AUTH_*` env vars). +- `imagePullSecrets: gitea-registry`. +- env from Secret; `HERMES_DASHBOARD=1`. +- Init: on first boot the s6 `01-hermes-setup` hook seeds config/SOUL/.env from + the ConfigMap if the volume is empty. We mount the ConfigMap as a readonly + projection at `/opt/seed/` and run a tiny initContainer to copy it into + `/opt/data` only when `/opt/data/config.yaml` doesn't exist (so ArgoCD + self-heal never fights the agent's live-edited config). + +--- + +## 7. Cron jobs to seed (Hermes-native) + +These are written by an init script (one-shot Job `hermes-cron-seed`) that runs +`hermes cron create ...` against the gateway on first install, and is idempotent +(it checks existing job names). All deliver to Discord. 
Examples: + +| Name | Schedule | Prompt (abbreviated) | +|------|----------|------------------------| +| `cluster-health-check` | `every 15m` | Run `kubectl get nodes`, `kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded` and `kubectl get events -A --field-selector type=Warning --sort-by=.lastTimestamp` (last 20 minutes only). If everything is healthy, reply with only `[SILENT]`. Otherwise summarize the failures and their likely root cause briefly. | +| `pod-restart-loop` | `every 10m` | Find pods in `CrashLoopBackOff`/`ImagePullBackOff` across all namespaces. For `CrashLoopBackOff`, fetch logs and if a clear transient cause (OOM, config parse, missing secret) is visible, attempt `kubectl rollout restart` on the owning workload; otherwise notify me with the log excerpt. Reply `[SILENT]` if none found. | +| `pvc-pressure` | `every 30m` | `kubectl get pv` + node disk via `kubectl top nodes`. Alert if any PVC `Bound` to a near-full volume or node disk >85%. `[SILENT]` otherwise. | +| `argocd-sync-health` | `every 1h` | `kubectl get applications -n argocd -o wide` (or `argocd app list` if the CLI is present). Report any `OutOfSync`/`Degraded` app. `[SILENT]` if all `Synced`+`Healthy`. | +| `cert-expiry` | `0 9 * * *` | List cert-manager `Certificate` resources with expiry < 21 days. Notify only if any. `[SILENT]` otherwise. | +| `node-resource-drift` | `every 30m` | `kubectl top nodes`. Alert if any node CPU>90% or mem>90% sustained, or any node `NotReady`. `[SILENT]` otherwise. | +| `daily-cluster-report` | `0 8 * * *` | Summarize: node count/status, top 5 pods by CPU/mem, # pods not Running, # ArgoCD apps OutOfSync, cert warnings. Always deliver (no `[SILENT]`). | + +Design rules baked into SOUL.md: +- **Read-only checks** run frequently (10–30m) and stay silent unless something is wrong. +- **Mutating actions** are restricted to safe idempotent ones (rollout restart, + delete stuck pod so controller recreates). 
Anything riskier → notify me with + a proposed command and wait for me to run it (I can reply in Discord to the + continuable thread). +- Cron sessions are isolated and **cannot create new cron jobs** (Hermes + disables that inside cron runs) → no runaway loops. + +--- + +## 8. Safety & guardrails + +1. **RBAC is the real boundary.** Even if the agent goes rogue, the SA can't + touch other namespaces' secrets beyond read, can't change RBAC, can't taint + nodes, can't create namespaces. +2. **`tool_loop_guardrails.hard_stop_enabled: true`** — circuit-breaks a stuck + gateway (recommended in the Docker doc for unattended deployments). +3. **`skills.write_approval: false` but `memory.write_approval: true`** (so the + agent can build skills/memories but I review memory writes lazily — flip + this if it gets noisy). +4. **No `pods/exec` subresource** initially (keep the agent from shelling into + workloads). Enable later only if you want log-grep-style debugging. +5. **Dashboard behind ingress TLS + basic auth** (the June-2026 hardening makes + auth mandatory on non-loopback binds; we satisfy it with the bundled + basic-auth provider). +6. **Single replica / single-writer PVC** — the Docker doc is explicit that two + gateways on the same `/opt/data` corrupt session/memory stores. Use a + `podAntiAffinity` so an accidental scale-up doesn't co-run. +7. **ArgoCD interaction:** keep `syncPolicy.automated.prune+selfHeal` but + exclude the live-edited hermes state. Practically: Argo owns the *manifests* + (deployment, configmap, secret, pvc), while `/opt/data` (config.yaml, + cron/jobs.json, SOUL.md edits made via the dashboard) is runtime state on the + PVC and is *not* reconciled by Argo. The ConfigMap only *seeds* it on first + boot. Document this clearly in the README so future-you doesn't expect Argo + to reset the agent's personality. + +--- + +## 9. Rollout plan + +1. Build & push the derived image to `git.rogi.casa/roger/hermes-agent` (tag + `v1.35-`). +2. 
Create the namespace + RBAC + Secret + ConfigMap + PVC: + `kubectl apply -f platform-engineer/`. +3. Create the `platform-engineer` Discord bot, invite it, put its token + your + channel id in `secret.yaml` (base64). +4. Apply the Deployment; wait for the pod to go Running. +5. `kubectl exec` in and run the one-shot cron seed: + `hermes cron create ...` (or apply the `cron-seed` Job). +6. Trigger the first `cluster-health-check` manually: `hermes cron run cluster-health-check`. +7. Add the app to `argocd/gen-apps.sh`, regenerate, commit, push. + +--- + +## 10. Decisions (locked in) + +1. **Notifications:** dedicated `platform-engineer` Discord bot → its own token + in `secret.yaml` (`DISCORD_BOT_TOKEN`, `DISCORD_HOME_CHANNEL`). +2. **Dashboard:** public at `hermes.rogi.casa` (Traefik TLS + cert-manager + the + bundled Hermes basic-auth provider). Reach the dashboard on port 9119; the + gateway API on 8642 is ClusterIP-only. +3. **Image:** derived image pushed to `git.rogi.casa/roger/hermes-agent`, pulled + via the existing `gitea-registry` imagePullSecret (must also exist in the + `platform-engineer` ns — see deploy steps). +4. **Model:** `qwen-3.6:27b` via the in-cluster Ollama box (`10.88.20.12:11434`), + exposed through LiteLLM as `qwen-3.6:27b`. Added to `litellm/litellm.yaml`. + Hermes reaches LiteLLM at `https://litellm.rogi.casa/v1` (never Ollama directly). +5. **pods/exec:** granted (`pods/exec` → `create` in the ClusterRole) so the + agent can `kubectl exec`/`kubectl logs` for debugging. + +--- + +## 11. Deployment checklist (do in this order) + +1. **Add the Ollama model to LiteLLM** (already done in `litellm/litellm.yaml`): + the `qwen-3.6:27b` entry points at `http://10.88.20.12:11434`. Make sure + `qwen3.6:27b` is actually pulled on that Ollama host + (`ollama pull qwen3.6:27b`). Apply: `kubectl apply -f litellm/` and restart + the LiteLLM pod so the new config takes effect. +2. 
**Create the `gitea-registry` secret in the new namespace** (ArgoCD won't + create it — it's not in the repo): + ``` + kubectl create namespace platform-engineer + kubectl create secret docker-registry gitea-registry \ + --docker-server=git.rogi.casa \ + --docker-username= \ + --docker-password= \ + --docker-email= \ + -n platform-engineer + ``` +3. **Build & push the image:** `./platform-engineer/build-and-push.sh` + (after `docker login git.rogi.casa`). +4. **Create the dedicated Discord bot**, invite it to your server, and put the + token + your channel id (base64) into `platform-engineer/secret.yaml`. Also + set the LiteLLM master key as `OPENAI_API_KEY` and a strong dashboard + password + a 32-byte session secret. +5. **Commit & push** the whole change. ArgoCD will create the namespace + resources, deploy the pod, and bring up the ingress at `hermes.rogi.casa`. +6. **Seed the cron jobs:** + `kubectl apply -f platform-engineer/cron-seed.yaml` (one-shot Job) — it waits + for the hermes pod, then runs `hermes cron create ...` for each watchdog. + Re-run it any time you want to re-seed after a wipe. +7. **Smoke test:** trigger the first health check manually — + `kubectl exec -n platform-engineer deploy/hermes -- hermes cron run cluster-health-check` — + and confirm the message lands in Discord. +8. **ArgoCD:** the `Application` (`argocd/apps/platform-engineer.yaml`) is + already generated. After commit, Argo will reconcile it like every other app. + +## 12. What ArgoCD owns vs. what is runtime state + +- **ArgoCD owns** (in git): namespace, RBAC, Secret, ConfigMap (seed), PVC, + Deployment, Service, Ingress, cron-seed Job. +- **Runtime state (on the PVC, NOT reconciled):** `config.yaml`, `SOUL.md`, + `.env`, `cron/jobs.json`, `sessions/`, `memories/`, `skills/`. The ConfigMap + only *seeds* these on first boot; after that, edits you make via the + dashboard or `hermes cron edit` persist on the PVC and Argo will not revert + them. 
If you ever want a hard reset, delete the PVC and re-apply. + +--- + +## Files in this directory + +| File | Purpose | +|------|---------| +| `namespace.yaml` | namespace `platform-engineer` | +| `rbac.yaml` | ServiceAccount + ClusterRole (+binding), least-privilege | +| `configmap.yaml` | seed `config.yaml` + `SOUL.md` | +| `secret.yaml` | Discord token, LiteLLM key, dashboard auth (PLACEHOLDERS — fill in) | +| `pvc.yaml` | 5 Gi PVC for `/opt/data` | +| `dockerfile` | derived image: hermes-agent + kubectl + helm (linux/amd64) | +| `build-and-push.sh` | builds & pushes the image to the Gitea registry | +| `deployment.yaml` | Deployment (1 replica, Recreate, pinned to amd64 NUC) + Service | +| `ingress.yaml` | `hermes.rogi.casa` → dashboard (TLS + basic auth) | +| `cron-seed.yaml` | one-shot Job that creates the Hermes cron schedule | + +Also changed outside this directory: +- `litellm/litellm.yaml` — added `qwen-3.6:27b` model entry. +- `argocd/gen-apps.sh` + `argocd/apps/platform-engineer.yaml` — ArgoCD + Application for this folder. +``` diff --git a/platform-engineer/build-and-push.sh b/platform-engineer/build-and-push.sh new file mode 100644 index 0000000..c9599cb --- /dev/null +++ b/platform-engineer/build-and-push.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Build & push the derived Hermes image (kubectl + helm) to the Gitea registry. +# +# Run this on a machine with docker + access to git.rogi.casa: +# ./platform-engineer/build-and-push.sh +# +# Prereqs: +# - docker login git.rogi.casa (use your Gitea username + access token) +set -euo pipefail + +REGISTRY="git.rogi.casa" +REPO="roger/hermes-agent" +TAG="${TAG:-v1.35-1}" +IMAGE="${REGISTRY}/${REPO}:${TAG}" + +cd "$(dirname "$0")" + +echo "==> Building ${IMAGE}" +docker build --platform linux/amd64 -t "${IMAGE}" -f dockerfile . + +echo "==> Pushing ${IMAGE}" +docker push "${IMAGE}" + +echo "==> Done. Update platform-engineer/deployment.yaml image: if you changed TAG." 
diff --git a/platform-engineer/configmap.yaml b/platform-engineer/configmap.yaml new file mode 100644 index 0000000..cad31d1 --- /dev/null +++ b/platform-engineer/configmap.yaml @@ -0,0 +1,115 @@ +# Seed files for Hermes: config.yaml and SOUL.md (cron jobs are seeded separately by cron-seed.yaml). +# Seeded into the PVC (/opt/data) by the initContainer on first boot only. +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: hermes-seed + namespace: platform-engineer +data: + config.yaml: | + model: + provider: openai-api + default: qwen-3.6:27b + base_url: "https://litellm.rogi.casa/v1" + api_mode: chat_completions + + # Cheap/fast model for auxiliary tasks (titling, compression). + auxiliary: + compression: + provider: openai-api + model: qwen-3.6:27b + base_url: "https://litellm.rogi.casa/v1" + title_generation: + provider: openai-api + model: qwen-3.6:27b + base_url: "https://litellm.rogi.casa/v1" + + terminal: + backend: local + cwd: /workspace + timeout: 180 + home_mode: profile + + # Unattended gateway → circuit-break on stuck tool-call loops. + tool_loop_guardrails: + hard_stop_enabled: true + hard_stop_after: + exact_failure: 5 + idempotent_no_progress: 5 + + sessions: + auto_prune: true + retention_days: 90 + + cron: + wrap_response: false + + memory: + memory_enabled: true + user_profile_enabled: true + write_approval: false + + skills: + write_approval: false + + SOUL.md: | + # Platform Engineer — rogi.casa k3s cluster + + You are the autonomous Platform Engineer for the `rogi.casa` K3s cluster. + You run *inside* the cluster (namespace `platform-engineer`) and your job is + to keep it healthy, fix small problems before they grow, and notify your + owner (Roger) on Discord when something needs a human. 
+ + ## The cluster you look after + + - **Nodes:** + - `raspberrypi` — control-plane, arm64 (4 GiB) + - `rpi2` — worker, arm, very low memory (~512 MiB) + - `roger-nucbox-evo-x2` — worker, amd64, 24 GiB (you run here) + - **GitOps:** ArgoCD owns every app from `https://git.rogi.casa/roger/k3s-cluster.git`. + Each app lives in its own folder; manifests are reconciled with prune + selfHeal. + - **Ingress:** Traefik; TLS via cert-manager + `letsencrypt-prod` Cloudflare Origin issuer. + - **LLM gateway:** LiteLLM at `https://litellm.rogi.casa/v1` — this is *your* model provider (you reach it through the Traefik ingress, never Ollama directly). + - **Services:** glance, pihole, litellm, gitea, home-assistant, jellyfin, n8n, + openwebui, phoenix, vaultwarden, qbittorrent, minecraft, monitoring + (prometheus + grafana), fava, myorg-assistant, gym-tracker, nas-proxy. + - **Your own RBAC** lets you read almost everything and mutate only an + allowlist (restart deployments/statefulsets/daemonsets, delete a stuck pod, + delete/patch jobs/cronjobs, `kubectl exec`). You CANNOT edit RBAC, taint + nodes, create/delete namespaces, or touch CRDs — if you think you need to, + propose the command to Roger and stop. + + ## Operating rules + + 1. **Read first, act second.** Before changing anything, gather the evidence: + `kubectl describe`, `kubectl logs`, `kubectl get events --since=...`, + `kubectl top`. Cite the exact resource (ns/name) and the exact command in + every report. + 2. **Only safe, idempotent remediations.** Allowed actions: + - `kubectl rollout restart deployment/ -n ` (and statefulset/daemonset) + - delete a single stuck `CrashLoopBackOff`/`ImagePullBackOff` pod so its + controller recreates it + - `kubectl delete job/` / `kubectl patch cronjob ...` + Never run a command that affects more than one workload at a time unless + Roger asked for it. + 3. 
**When in doubt, notify, don't act.** If a fix is risky, unusual, or would + touch state you can't reach (RBAC, nodes, CRDs, PVC data), post the + proposed command to Discord and wait for Roger to reply. + 4. **Be quiet when healthy.** Watchdog cron jobs reply with exactly `[SILENT]` + when there is nothing to report. Failed jobs always deliver regardless. + 5. **No runaway loops.** You cannot create new cron jobs from inside a cron run + (Hermes disables that). Do not try. + 6. **Talk like an engineer.** Short, concrete, with resource names and + commands. No filler. When you fixed something, say what you did in one line. + 7. **Respect GitOps.** If an app is `OutOfSync`/`Degraded` in ArgoCD, do not + hand-edit resources to "fix" it — Argo will revert you. Report it so Roger + can fix the source repo. + + ## How you reach Roger + + Notifications go to Discord (your home channel). Cron jobs deliver there by + default (`deliver="discord"`). Keep messages under ~1800 chars; attach + longer logs as `kubectl logs ... > /opt/data/cron/output/` and link + the path. + diff --git a/platform-engineer/cron-seed.yaml b/platform-engineer/cron-seed.yaml new file mode 100644 index 0000000..c3876ad --- /dev/null +++ b/platform-engineer/cron-seed.yaml @@ -0,0 +1,74 @@ +# One-shot Job that seeds Hermes' built-in cron schedule on first install. +# Idempotent: skips job names that already exist. +# +# The agent's own cron jobs live in /opt/data/cron/jobs.json on the PVC and are +# NOT reconciled by ArgoCD (runtime state). 
Re-run this Job manually after a +# wipe to re-seed: kubectl delete job hermes-cron-seed -n platform-engineer, then re-apply this file +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: hermes-cron-seed + namespace: platform-engineer + labels: + app: hermes-cron-seed +spec: + backoffLimit: 4 + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + app: hermes-cron-seed # must differ from app=hermes: the Service selector and the pod lookup below would otherwise match this Job's own pod + spec: + serviceAccountName: platform-engineer + restartPolicy: OnFailure + containers: + - name: seed + image: bitnami/kubectl:1.35 + command: ["sh", "-c"] + args: + - | + set -e + echo "Waiting for hermes pod to be Ready..." + kubectl -n platform-engineer wait --for=condition=Ready pod -l app=hermes --timeout=300s || true + + POD=$(kubectl -n platform-engineer get pod -l app=hermes -o jsonpath='{.items[0].metadata.name}') + echo "Using pod: $POD" + + exists() { kubectl -n platform-engineer exec "$POD" -- hermes cron list 2>/dev/null | grep -qi "name=$1\| $1 "; } + + create() { + name="$1"; schedule="$2"; deliver="$3"; prompt="$4" + if exists "$name"; then + echo "cron job '$name' already exists — skipping" + else + echo "creating cron job '$name' ..." + kubectl -n platform-engineer exec "$POD" -- hermes cron create "$schedule" "$prompt" --name "$name" --deliver "$deliver" + fi + } + + # ---- Watchdog checks (silent unless something is wrong) ---- + create "cluster-health-check" "every 15m" "discord" \ + "Run: kubectl get nodes; kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded; kubectl get events -A --field-selector type=Warning --sort-by=.lastTimestamp (only consider events from the last 20 minutes). If everything is healthy and there are no Warning events, reply with exactly [SILENT]. Otherwise give a concise per-resource summary of what is wrong (node name, pod ns/name, phase, last event)." + + create "pod-restart-loop" "every 10m" "discord" \ + "Find pods in CrashLoopBackOff or ImagePullBackOff across all namespaces (kubectl get pods -A). For each, fetch kubectl logs (previous) and describe. 
If the cause is clearly transient (OOM kill, a one-off config parse error that will retry cleanly, a missing Secret the controller will recreate), attempt ONE safe remediation: kubectl rollout restart of the owning Deployment/StatefulSet/DaemonSet, OR delete the single stuck pod. Report what you did in one line per resource. If the cause is not clearly transient (bad image, missing config, auth failure), do NOT act — post the log excerpt and the proposed command and wait for Roger. If no such pods exist, reply [SILENT]." + + create "pvc-pressure" "every 30m" "discord" \ + "Check cluster storage health: kubectl get pv,pvc -A; kubectl top nodes. Alert if any PVC is Pending/Lost or any node filesystem usage is over 85%. If all healthy, reply [SILENT]." + + create "argocd-sync-health" "every 1h" "discord" \ + "Run: kubectl get applications -n argocd -o custom-columns=NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status. If every app is Synced and Healthy, reply [SILENT]. Otherwise list the OutOfSync/Degraded apps with their status. Do NOT hand-edit resources to fix them (Argo will revert) — just report." + + create "cert-expiry" "0 9 * * *" "discord" \ + "List all cert-manager Certificate resources (kubectl get certificates -A). For each, check notAfter. Alert on any certificate expiring in under 21 days. If none, reply [SILENT]." + + create "node-resource-drift" "every 30m" "discord" \ + "Run kubectl top nodes. If any node CPU or memory usage is over 90%, or any node is NotReady, report it with the numbers. Otherwise reply [SILENT]." 
+ + # ---- Daily report (always delivered) ---- + create "daily-cluster-report" "0 8 * * *" "discord" \ + "Produce a daily cluster report for Roger: (1) node count + Ready/NotReady; (2) top 5 pods by CPU and by memory across all namespaces (kubectl top pods -A --sort-by); (3) count of pods not Running; (4) ArgoCD apps OutOfSync or Degraded; (5) any certificates expiring within 30 days; (6) any recent Warning events (last 24h). Keep it under 1800 chars. Always deliver (no [SILENT])." + + echo "Done. Listing all cron jobs:" + kubectl -n platform-engineer exec "$POD" -- hermes cron list diff --git a/platform-engineer/deployment.yaml b/platform-engineer/deployment.yaml new file mode 100644 index 0000000..8fe3de8 --- /dev/null +++ b/platform-engineer/deployment.yaml @@ -0,0 +1,134 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: hermes + namespace: platform-engineer + labels: + app: hermes +spec: + replicas: 1 # MUST be 1 — Hermes' /opt/data is single-writer. + strategy: + type: Recreate # never run two pods against the same PVC + selector: + matchLabels: + app: hermes + template: + metadata: + labels: + app: hermes + spec: + serviceAccountName: platform-engineer + imagePullSecrets: + - name: gitea-registry + + # Pin to the powerful amd64 node (image is linux/amd64; the NUC has 24 GiB). + nodeSelector: + kubernetes.io/arch: amd64 + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["high-memory"] + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app: hermes + topologyKey: kubernetes.io/hostname + + initContainers: + # Seed /opt/data with config.yaml + SOUL.md on first boot only. + # ArgoCD owns the manifests; the PVC is runtime state and is NOT reconciled. + - name: seed-data + image: busybox:1.36 + command: ["sh", "-c"] + args: + - | + set -e + if [ ! 
-f /opt/data/config.yaml ]; then + echo "First boot: seeding /opt/data from ConfigMap..." + cp /seed/config.yaml /opt/data/config.yaml + cp /seed/SOUL.md /opt/data/SOUL.md + chmod 600 /opt/data/config.yaml + else + echo "/opt/data already initialized — leaving runtime state intact." + fi + mkdir -p /opt/data/home/.kube /opt/data/cron/output /opt/data/scripts /workspace + volumeMounts: + - name: data + mountPath: /opt/data + - name: seed + mountPath: /seed + + containers: + - name: hermes + image: git.rogi.casa/roger/hermes-agent:v1.35-1 + imagePullPolicy: Always + args: ["gateway", "run"] # args (= Docker CMD), not command: `command` would replace the image ENTRYPOINT and skip the s6 init + ports: + - name: gateway + containerPort: 8642 + - name: dashboard + containerPort: 9119 + envFrom: + - secretRef: + name: hermes-env + env: + # KUBERNETES_SERVICE_HOST/PORT are injected automatically; kubectl inside the pod uses the SA token. + - name: HERMES_HOME + value: /opt/data + volumeMounts: + - name: data + mountPath: /opt/data + - name: workspace + mountPath: /workspace + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: /health + port: 8642 + initialDelaySeconds: 60 + periodSeconds: 30 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: false # official image runs as root for s6 init then drops to hermes + + volumes: + - name: data + persistentVolumeClaim: + claimName: hermes-data + - name: workspace + emptyDir: {} + - name: seed + configMap: + name: hermes-seed +--- +apiVersion: v1 +kind: Service +metadata: + name: hermes + namespace: platform-engineer +spec: + type: ClusterIP + selector: + app: hermes + ports: + - name: gateway + port: 80 + targetPort: 8642 + - name: dashboard + port: 9119 + targetPort: 9119 diff --git a/platform-engineer/dockerfile b/platform-engineer/dockerfile new file mode 100644 index 0000000..06c4409 --- /dev/null +++ b/platform-engineer/dockerfile @@ -0,0 +1,31 @@ +# Derived Hermes Agent image with kubectl + helm so the agent can drive the +# 
k3s cluster from inside the container (terminal backend = local).
+#
+# Build & push to the Gitea registry:
+#   docker build -t git.rogi.casa/roger/hermes-agent:v1.35-1 -f dockerfile .
+#   docker push git.rogi.casa/roger/hermes-agent:v1.35-1
+#
+# This image targets linux/amd64 (the agent pod is pinned to the amd64 NUC).
+FROM nousresearch/hermes-agent:latest
+
+USER root
+
+# kubectl (v1.35 to match the cluster's k3s version)
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends curl gnupg ca-certificates \
+    && curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.35/deb/Release.key \
+       | gpg --dearmor -o /usr/share/keyrings/kubernetes-apt-keyring.gpg \
+    && echo 'deb [signed-by=/usr/share/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.35/deb/ /' \
+       > /etc/apt/sources.list.d/kubernetes.list \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends kubectl \
+    # helm
+    && curl -fsSL https://get.helm.sh/helm-v3.16.3-linux-amd64.tar.gz \
+       | tar -xz -C /usr/local/bin --strip-components=1 linux-amd64/helm \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Hermes' own CLI/kubeconfig helper dir for tool subprocesses
+RUN mkdir -p /opt/data/home/.kube
+
+USER hermes
diff --git a/platform-engineer/ingress.yaml b/platform-engineer/ingress.yaml
new file mode 100644
index 0000000..ff447fe
--- /dev/null
+++ b/platform-engineer/ingress.yaml
@@ -0,0 +1,26 @@
+# Routes hermes.rogi.casa through Traefik to the hermes Service on port 9119.
+# cert-manager (letsencrypt-prod) stores the TLS certificate in hermes-tls.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: hermes
+  namespace: platform-engineer
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+spec:
+  ingressClassName: traefik
+  tls:
+    - hosts:
+        - hermes.rogi.casa
+      secretName: hermes-tls
+  rules:
+    - host: hermes.rogi.casa
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: hermes
+                port:
+                  number: 9119 # dashboard
diff --git a/platform-engineer/namespace.yaml b/platform-engineer/namespace.yaml
new file mode 100644
index 0000000..ed57f3f
--- /dev/null
+++ 
b/platform-engineer/namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: platform-engineer
diff --git a/platform-engineer/pvc.yaml b/platform-engineer/pvc.yaml
new file mode 100644
index 0000000..12ecb25
--- /dev/null
+++ b/platform-engineer/pvc.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: hermes-data
+  namespace: platform-engineer
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 5Gi
diff --git a/platform-engineer/rbac.yaml b/platform-engineer/rbac.yaml
new file mode 100644
index 0000000..e03b954
--- /dev/null
+++ b/platform-engineer/rbac.yaml
@@ -0,0 +1,123 @@
+# Least-privilege RBAC for the Platform Engineer Hermes agent.
+#
+# The agent can READ almost everything cluster-wide, but can only MUTATE a
+# narrow allowlist of safe, idempotent resources (restart deployments, delete a
+# stuck pod so its controller recreates it, etc.). It CANNOT touch RBAC, nodes,
+# namespaces, CRDs, or other namespaces' Secrets beyond read.
+#
+# NOTE(review): the read rule covers `secrets` cluster-wide, and `pods/exec`
+# and `nodes/proxy` are also granted — confirm each is really needed, since
+# together they reach well beyond the narrow mutate allowlist.
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: platform-engineer
+  namespace: platform-engineer
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: platform-engineer
+rules:
+  # ---- Broad read access (cluster-wide) ----
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - nodes/proxy
+      - services
+      - endpoints
+      - pods
+      - pods/log
+      - configmaps
+      - secrets
+      - persistentvolumeclaims
+      - persistentvolumes
+      - namespaces
+      - events
+      - replicationcontrollers
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["apps"]
+    resources:
+      - deployments
+      - statefulsets
+      - daemonsets
+      - replicasets
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["batch"]
+    resources:
+      - jobs
+      - cronjobs
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["networking.k8s.io"]
+    resources:
+      - ingresses
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["autoscaling"]
+    resources:
+      - horizontalpodautoscalers
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["argoproj.io"]
+    resources:
+      - applications
+      - appprojects
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["cert-manager.io"]
+    resources:
+      - certificates
+      - certificaterequests
+      - clusterissuers
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["metrics.k8s.io"]
+    resources:
+      - pods
+      - nodes
+    verbs: ["get", "list"]
+
+  # ---- Metrics / health endpoints ----
+  - nonResourceURLs: ["/metrics", "/metrics/*"]
+    verbs: ["get"]
+
+  # ---- Narrow mutate allowlist (idempotent, safe remediation) ----
+  # Restart a stuck pod by deleting it (its controller recreates it).
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["delete", "patch"]
+  # `kubectl rollout restart` and scaling for the apps/batch controllers.
+  - apiGroups: ["apps"]
+    resources:
+      - deployments
+      - statefulsets
+      - daemonsets
+      - replicasets
+    verbs: ["patch", "update"]
+  # `kubectl scale` uses the /scale subresource, which a grant on the parent
+  # resource does not include.
+  - apiGroups: ["apps"]
+    resources:
+      - deployments/scale
+      - statefulsets/scale
+      - replicasets/scale
+    verbs: ["get", "patch", "update"]
+  - apiGroups: ["batch"]
+    resources:
+      - jobs
+      - cronjobs
+    verbs: ["patch", "update", "delete"]
+  # Exec into pods for log-style / debug inspection (granted per request #5).
+  - apiGroups: [""]
+    resources: ["pods/exec"]
+    verbs: ["create"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: platform-engineer
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: platform-engineer
+subjects:
+  - kind: ServiceAccount
+    name: platform-engineer
+    namespace: platform-engineer