From 6bc384726509e430a36aa0eb97a73cad7586e581 Mon Sep 17 00:00:00 2001 From: Sean Eagan Date: Wed, 20 Nov 2019 10:34:16 -0600 Subject: [PATCH] Fix coredns checks The readiness probe and helm test currently rely on the return code of `dig`, which doesn't fail for NXDOMAIN, which means they are not checking that the names are actually resolved. This moves to using `host` instead which does check this. This also removes the checks for kubernetes etcd domain names, since that doesn't get deployed until after coredns. Change-Id: I0b459f52663c936ed4b8b216614c5b4824a0713f --- charts/coredns/templates/bin/_probe.py.tpl | 22 +++++++++++++---- charts/coredns/templates/configmap-list.yaml | 2 +- charts/coredns/templates/etc/_list.tpl | 4 +-- charts/coredns/templates/pod-test.yaml | 26 ++++++++++++++------ examples/basic/armada-resources.yaml | 1 - examples/complete/armada-resources.yaml | 1 - examples/containerd/armada-resources.yaml | 1 - examples/gate/armada-resources.yaml | 1 - 8 files changed, 38 insertions(+), 20 deletions(-) diff --git a/charts/coredns/templates/bin/_probe.py.tpl b/charts/coredns/templates/bin/_probe.py.tpl index 9e3ac22f..d7e6ed0f 100644 --- a/charts/coredns/templates/bin/_probe.py.tpl +++ b/charts/coredns/templates/bin/_probe.py.tpl @@ -17,12 +17,24 @@ class httpHandler(BaseHTTPRequestHandler): failed = False res = requests.get("http://127.0.0.1:{}/health".format(args.check_port)) if res.status_code >= 400: + print('Failed /health check, status code = : {}'.format(res.status_code)) failed = True - res = subprocess.run( - ["dig", "+time=2", "+tries=1", "@127.0.0.1", "-f", args.filename], - stdout=subprocess.DEVNULL) - if res.returncode != 0: - failed = True + + with open(args.filename, 'r') as fh: + for host in fh.read().splitlines(): + # ignore blank lines + if not host: + continue + res = subprocess.run( + ["host", "-W=2", "-R=1", host, "127.0.0.1"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + if res.returncode != 0: + print('Failed to resolve host: "{}"'.format(host)) + print(res.stdout) + failed = True + break + if failed: print('Check failed') self.send_response(500) diff --git a/charts/coredns/templates/configmap-list.yaml b/charts/coredns/templates/configmap-list.yaml index d3729944..88a0a869 100644 --- a/charts/coredns/templates/configmap-list.yaml +++ b/charts/coredns/templates/configmap-list.yaml @@ -4,5 +4,5 @@ kind: ConfigMap metadata: name: {{ .Values.service.name }}-list data: - names_to_resolve: | + names_to_resolve: |- {{ tuple "etc/_list.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} diff --git a/charts/coredns/templates/etc/_list.tpl b/charts/coredns/templates/etc/_list.tpl index 8df545b9..9fbaae00 100644 --- a/charts/coredns/templates/etc/_list.tpl +++ b/charts/coredns/templates/etc/_list.tpl @@ -1,3 +1,3 @@ -{{- range .Values.conf.test.names_to_resolve }} +{{- range .Values.conf.test.names_to_resolve -}} {{ . }} -{{- end }} +{{ end -}} diff --git a/charts/coredns/templates/pod-test.yaml b/charts/coredns/templates/pod-test.yaml index d456281b..6d7b5235 100644 --- a/charts/coredns/templates/pod-test.yaml +++ b/charts/coredns/templates/pod-test.yaml @@ -40,16 +40,26 @@ spec: - -c - | SUCCESS=1 - {{- range .Values.conf.test.names_to_resolve }} - if dig {{ . }}; then - echo "Successfully resolved {{ . }}" - else - echo "Failed to resolve {{ . }}" - SUCCESS=0 - fi - {{- end }} + while read host; do + if [ -n "$host" ]; then + if host "$host"; then + echo "Successfully resolved: \"$host\"" + else + echo "Failed to resolve: \"$host\"" + SUCCESS=0 + fi + fi + done < /tmp/etc/names_to_resolve if [ "$SUCCESS" != "1" ]; then echo "Test failed to resolve all names." exit 1 fi + volumeMounts: + - name: dns-names + mountPath: /tmp/etc + volumes: + - name: dns-names + configMap: + name: {{ $envAll.Values.service.name }}-list + defaultMode: 0555 {{- end }} diff --git a/examples/basic/armada-resources.yaml b/examples/basic/armada-resources.yaml index ec59d033..b2a3c3b8 100644 --- a/examples/basic/armada-resources.yaml +++ b/examples/basic/armada-resources.yaml @@ -563,7 +563,6 @@ data: test: names_to_resolve: - calico-etcd.kube-system.svc.cluster.local - - kubernetes-etcd.kube-system.svc.cluster.local - kubernetes.default.svc.cluster.local images: tags: diff --git a/examples/complete/armada-resources.yaml b/examples/complete/armada-resources.yaml index ed9d9170..1cf862c1 100644 --- a/examples/complete/armada-resources.yaml +++ b/examples/complete/armada-resources.yaml @@ -584,7 +584,6 @@ data: test: names_to_resolve: - calico-etcd.kube-system.svc.cluster.local - - kubernetes-etcd.kube-system.svc.cluster.local - kubernetes.default.svc.cluster.local images: diff --git a/examples/containerd/armada-resources.yaml b/examples/containerd/armada-resources.yaml index 338b8cf7..f5eeb510 100644 --- a/examples/containerd/armada-resources.yaml +++ b/examples/containerd/armada-resources.yaml @@ -438,7 +438,6 @@ data: - att.com - calico-etcd.kube-system.svc.cluster.local - google.com - - kubernetes-etcd.kube-system.svc.cluster.local - kubernetes.default.svc.cluster.local images: diff --git a/examples/gate/armada-resources.yaml b/examples/gate/armada-resources.yaml index d3b2d16f..a75465a5 100644 --- a/examples/gate/armada-resources.yaml +++ b/examples/gate/armada-resources.yaml @@ -459,7 +459,6 @@ data: - att.com - calico-etcd.kube-system.svc.cluster.local - google.com - - kubernetes-etcd.kube-system.svc.cluster.local - kubernetes.default.svc.cluster.local images: