From 69cb269230b1007885097fab4049dab9d9787f66 Mon Sep 17 00:00:00 2001
From: Mark Burnett
Date: Thu, 30 Aug 2018 12:18:06 -0500
Subject: [PATCH] Make K8S proxy health check more aggressive

In K8S version 1.10, the proxy can sometimes get stuck believing that
some services do not have any endpoints. This seems to be triggered by
network instability, though the proxy doesn't seem to recover on its
own, while bouncing the pod fixes the issue.

This change adds a naive means of detecting and recovering from this
(`iptables-save | grep 'has no endpoints'` in the liveness probe) that
may occasionally have false positives. As such, the liveness probe is
configured very conservatively to avoid triggering CrashLoopBackoff in
the event of a false positive.

Finally, there is a whitelist feature to help avoid false positives for
services that are known to legitimately have empty endpoints during the
course of normal operation (e.g. Patroni might manage such an endpoint
list).

Change-Id: I29a770fab70b1fb79db59ef5408f40b2af1c01f9
---
Review note (ignored by git am): the post-image blob ids on the "index"
lines of _liveness-probe.sh.tpl, _readiness-probe.sh.tpl and values.yaml
were not regenerated after the review edits; they are informational only.

 .../templates/bin/_liveness-probe.sh.tpl      | 34 +++++++++++++++++++
 .../templates/bin/_readiness-probe.sh.tpl     |  7 ++++
 charts/proxy/templates/configmap-bin.yaml     | 26 ++++++++++++++
 charts/proxy/templates/daemonset.yaml         | 27 +++++++--------
 charts/proxy/values.yaml                      | 15 ++++++++
 5 files changed, 95 insertions(+), 14 deletions(-)
 create mode 100644 charts/proxy/templates/bin/_liveness-probe.sh.tpl
 create mode 100644 charts/proxy/templates/bin/_readiness-probe.sh.tpl
 create mode 100644 charts/proxy/templates/configmap-bin.yaml

diff --git a/charts/proxy/templates/bin/_liveness-probe.sh.tpl b/charts/proxy/templates/bin/_liveness-probe.sh.tpl
new file mode 100644
index 00000000..c00266aa
--- /dev/null
+++ b/charts/proxy/templates/bin/_liveness-probe.sh.tpl
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Liveness probe for kube-proxy: fails when the built-in health endpoint does
+# not answer 200 OK, or when iptables-save reports non-whitelisted services
+# that have no endpoints.
+
+set -e
+
+FAILURE=0
+{{- if .Values.livenessProbe.whitelist }}
+# Extended regex; iptables-save lines matching it are ignored below.
+WHITELIST='({{- join "|" .Values.livenessProbe.whitelist -}})'
+{{- end }}
+
+REQUEST='GET /healthz HTTP/1.0\r\nHost: localhost:10256\r\n'
+
+if [[ $(echo -e "${REQUEST}" | socat - TCP4:localhost:10256 | grep -sc '200 OK') -lt 1 ]]; then
+    echo Failed proxy built-in HTTP health check.
+    # Diagnostic output only; "|| true" keeps "set -e" from aborting the
+    # script here, so the endpoint check below still runs.
+    echo -e "${REQUEST}" | socat - TCP4:localhost:10256 || true
+    FAILURE=1
+fi
+
+if [[ $(iptables-save {{- if .Values.livenessProbe.whitelist }} | grep -Ev "${WHITELIST}" {{- end }} | grep -sc 'has no endpoints') -gt 0 ]]; then
+    echo Some non-whitelisted services have no endpoints:
+    # Apply the same whitelist filter as the check above so that only the
+    # offending rules are listed.
+    iptables-save {{- if .Values.livenessProbe.whitelist }} | grep -Ev "${WHITELIST}" {{- end }} | grep 'has no endpoints' || true
+    FAILURE=1
+fi
+
+if [[ "${FAILURE}" == "1" ]]; then
+    exit 1
+fi
diff --git a/charts/proxy/templates/bin/_readiness-probe.sh.tpl b/charts/proxy/templates/bin/_readiness-probe.sh.tpl
new file mode 100644
index 00000000..3f87b48f
--- /dev/null
+++ b/charts/proxy/templates/bin/_readiness-probe.sh.tpl
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Readiness probe: succeeds only when the iptables-save output contains a
+# rule mentioning the default/kubernetes:https service.
+
+set -e
+
+iptables-save | grep 'default/kubernetes:https'
diff --git a/charts/proxy/templates/configmap-bin.yaml b/charts/proxy/templates/configmap-bin.yaml
new file mode 100644
index 00000000..8b98721c
--- /dev/null
+++ b/charts/proxy/templates/configmap-bin.yaml
@@ -0,0 +1,26 @@
+{{/*
+# Copyright (c) 2018 AT&T Intellectual Property. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. */}}
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kubernetes-proxy-bin
+data:
+  liveness-probe.sh: |
+{{ tuple "bin/_liveness-probe.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+  readiness-probe.sh: |
+{{ tuple "bin/_readiness-probe.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+...
diff --git a/charts/proxy/templates/daemonset.yaml b/charts/proxy/templates/daemonset.yaml
index 1af7df6b..ac77e107 100644
--- a/charts/proxy/templates/daemonset.yaml
+++ b/charts/proxy/templates/daemonset.yaml
@@ -63,24 +63,23 @@ spec:
         - name: KUBERNETES_SERVICE_PORT
           value: {{ .Values.kube_service.port | quote }}
         livenessProbe:
-          httpGet:
-            host: 127.0.0.1
-            path: /healthz
-            port: 10256
-          failureThreshold: 3
-          initialDelaySeconds: 15
-          periodSeconds: 10
-          successThreshold: 1
-          timeoutSeconds: 5
+{{ toYaml .Values.livenessProbe.config | indent 10 }}
+          exec:
+            command:
+            - /tmp/bin/liveness-probe.sh
         readinessProbe:
           exec:
             command:
-            - sh
-            - -c
-            - |-
-              set -ex
-              iptables-save | grep 'default/kubernetes:https'
+            - /tmp/bin/readiness-probe.sh
           initialDelaySeconds: 15
           periodSeconds: 15
+        volumeMounts:
+        - name: bin
+          mountPath: /tmp/bin/
       serviceAccountName: kube-proxy
+      volumes:
+      - name: bin
+        configMap:
+          name: kubernetes-proxy-bin
+          defaultMode: 0555
 {{- end }}
diff --git a/charts/proxy/values.yaml b/charts/proxy/values.yaml
index 233467f8..18d1f2df 100644
--- a/charts/proxy/values.yaml
+++ b/charts/proxy/values.yaml
@@ -55,3 +55,18 @@ network:
 kube_service:
   host: 127.0.0.1
   port: 6553
+
+livenessProbe:
+  config:
+    # NOTE(mark-burnett): To avoid cascading failure modes, it is
+    # important that these values are configured to avoid the possibility
+    # of CrashLoopBackoff for this pod. Otherwise, a small non-impacting
+    # issue could disable kube-proxy for the entire site.
+    failureThreshold: 10
+    initialDelaySeconds: 15
+    periodSeconds: 35
+    successThreshold: 1
+    timeoutSeconds: 10
+  # Extended-regex fragments; matching iptables-save lines are ignored by
+  # the liveness probe's "has no endpoints" check. Empty list disables it.
+  whitelist: []
+    # - postgres