fix hermes liveness probe

This commit is contained in:
Roger Oriol
2026-06-28 00:43:09 +02:00
parent 4d9195b32d
commit 734962d198
2 changed files with 31 additions and 7 deletions

View File

@@ -24,18 +24,37 @@ spec:
# Pod spec fragment for the "seed" container, which runs the inline shell
# script below via `sh -c`.
restartPolicy: OnFailure
containers:
- name: seed
# NOTE(review): two `image:` keys appear in this hunk. Presumably the first
# is the line this commit removes and the second the line it adds (the diff
# is shown without +/- markers) — confirm against the real manifest, which
# must contain only one.
image: bitnami/kubectl:1.35
# alpine is tiny and always available; we install curl + download the
# right-arch kubectl binary at runtime (bitnami/kubectl tags are
# inconsistent across versions, so we avoid depending on them).
image: alpine:3.20
command: ["sh", "-c"]
args:
- |
# Abort on the first failing command (commands guarded by `|| true` are
# exempt).
set -e
# Install curl, then download kubectl for this node's architecture.
apk add --no-cache curl
# Translate the machine name reported by `uname -m` into the architecture
# suffix used in the kubectl download URL; any other value prints an error
# to stderr and exits with status 1.
ARCH=$(uname -m)
case "$ARCH" in
x86_64) KARCH=amd64 ;;
aarch64) KARCH=arm64 ;;
armv7l) KARCH=arm ;;
*) echo "unsupported arch: $ARCH" >&2; exit 1 ;;
esac
echo "Downloading kubectl for linux/$KARCH ..."
# The kubectl version is pinned to v1.35.0 in the URL. curl flags: -f fail
# on HTTP errors, -sS quiet but still report errors, -L follow redirects.
curl -fsSL -o /usr/local/bin/kubectl \
"https://dl.k8s.io/release/v1.35.0/bin/linux/${KARCH}/kubectl"
chmod +x /usr/local/bin/kubectl
# Check that the downloaded binary actually runs (client version only).
kubectl version --client
echo "Waiting for hermes pod to be Ready..."
# Wait up to 300s for pods labelled app=hermes to become Ready. `|| true`
# stops `set -e` from aborting when the wait fails or times out, so the
# script continues either way.
kubectl -n platform-engineer wait --for=condition=Ready pod -l app=hermes --timeout=300s || true
# Use the name of the first pod returned for the app=hermes label.
POD=$(kubectl -n platform-engineer get pod -l app=hermes -o jsonpath='{.items[0].metadata.name}')
echo "Using pod: $POD"
# exists NAME: runs `hermes cron list` inside $POD (stderr discarded) and
# succeeds when a case-insensitive, quiet grep finds NAME in the output.
# NOTE(review): exists() is defined twice in a row; in sh the later
# definition replaces the earlier one. Presumably the first line (which also
# matches "name=NAME") is the version removed by this commit and the second
# (matching only " NAME ") is the one added — confirm.
exists() { kubectl -n platform-engineer exec "$POD" -- hermes cron list 2>/dev/null | grep -qi "name=$1\| $1 "; }
exists() { kubectl -n platform-engineer exec "$POD" -- hermes cron list 2>/dev/null | grep -qi " $1 "; }
create() {
name="$1"; schedule="$2"; deliver="$3"; prompt="$4"

View File

@@ -128,12 +128,17 @@ spec:
memory: "2Gi"
cpu: "1000m"
# Liveness probe for the container.
# NOTE(review): this hunk lists two probe handlers (httpGet and tcpSocket)
# and repeats initialDelaySeconds and failureThreshold. Presumably the
# httpGet handler, the 60s delay and the threshold of 3 are the lines this
# commit removes, and the tcpSocket handler, the 90s delay, timeoutSeconds
# and the threshold of 5 are the lines it adds (the diff is shown without
# +/- markers) — confirm against the real manifest, which must contain a
# single handler and one value per key.
livenessProbe:
# HTTP GET against /health on port 8642.
httpGet:
path: /health
port: 8642
initialDelaySeconds: 60
# Probe the dashboard port (9119, always enabled via HERMES_DASHBOARD=1
# and binds 0.0.0.0). The gateway API on 8642 is off by default
# (API_SERVER_ENABLED not set), so 9119 is the reliable liveness signal.
# s6 auto-restarts the gateway itself; this probe only catches a wedged
# container.
tcpSocket:
port: 9119
initialDelaySeconds: 90
# Probe every 30s. With that period, a failureThreshold of 3 tolerates
# about 90s of consecutive failures and a threshold of 5 about 150s.
periodSeconds: 30
failureThreshold: 3
# Each probe attempt is given 5s before it counts as a failure.
timeoutSeconds: 5
failureThreshold: 5
securityContext:
allowPrivilegeEscalation: false