diff --git a/platform-engineer/cron-seed.yaml b/platform-engineer/cron-seed.yaml
index c3876ad..ddaa364 100644
--- a/platform-engineer/cron-seed.yaml
+++ b/platform-engineer/cron-seed.yaml
@@ -24,18 +24,42 @@ spec:
       restartPolicy: OnFailure
       containers:
         - name: seed
-          image: bitnami/kubectl:1.35
+          # alpine is tiny and always available; we install curl + download the
+          # right-arch kubectl binary at runtime (bitnami/kubectl tags are
+          # inconsistent across versions, so we avoid depending on them).
+          image: alpine:3.20
           command: ["sh", "-c"]
           args:
             - |
               set -e
+
+              # Install curl, then download kubectl for this node's architecture.
+              apk add --no-cache curl
+              ARCH=$(uname -m)
+              case "$ARCH" in
+                x86_64) KARCH=amd64 ;;
+                aarch64) KARCH=arm64 ;;
+                armv7l) KARCH=arm ;;
+                *) echo "unsupported arch: $ARCH" >&2; exit 1 ;;
+              esac
+              KUBECTL_URL="https://dl.k8s.io/release/v1.35.0/bin/linux/${KARCH}/kubectl"
+              echo "Downloading kubectl for linux/$KARCH ..."
+              curl -fsSL -o /usr/local/bin/kubectl "$KUBECTL_URL"
+              # Verify the binary against its published SHA-256 before it is made
+              # executable; under `set -e` a mismatch aborts the job.
+              curl -fsSL -o /tmp/kubectl.sha256 "${KUBECTL_URL}.sha256"
+              echo "$(cat /tmp/kubectl.sha256)  /usr/local/bin/kubectl" | sha256sum -c -
+              chmod +x /usr/local/bin/kubectl
+              kubectl version --client
+
               echo "Waiting for hermes pod to be Ready..."
               kubectl -n platform-engineer wait --for=condition=Ready pod -l app=hermes --timeout=300s || true
               POD=$(kubectl -n platform-engineer get pod -l app=hermes -o jsonpath='{.items[0].metadata.name}')
               echo "Using pod: $POD"
 
-              exists() { kubectl -n platform-engineer exec "$POD" -- hermes cron list 2>/dev/null | grep -qi "name=$1\| $1 "; }
+              # -F matches the job name literally; as a regex, "." etc. would over-match.
+              exists() { kubectl -n platform-engineer exec "$POD" -- hermes cron list 2>/dev/null | grep -qiF " $1 "; }
 
               create() {
                 name="$1"; schedule="$2"; deliver="$3"; prompt="$4"
diff --git a/platform-engineer/deployment.yaml b/platform-engineer/deployment.yaml
index f84c7b9..6770e9f 100644
--- a/platform-engineer/deployment.yaml
+++ b/platform-engineer/deployment.yaml
@@ -128,12 +128,17 @@ spec:
               memory: "2Gi"
               cpu: "1000m"
           livenessProbe:
-            httpGet:
-              path: /health
-              port: 8642
-            initialDelaySeconds: 60
+            # Probe the dashboard port (9119, always enabled via HERMES_DASHBOARD=1
+            # and binds 0.0.0.0). The gateway API on 8642 is off by default
+            # (API_SERVER_ENABLED not set), so 9119 is the reliable liveness signal.
+            # s6 auto-restarts the gateway itself; this probe only catches a wedged
+            # container.
+            tcpSocket:
+              port: 9119
+            initialDelaySeconds: 90
             periodSeconds: 30
-            failureThreshold: 3
+            timeoutSeconds: 5
+            failureThreshold: 5
           securityContext:
             allowPrivilegeEscalation: false