From 6b2c7245de93fe3d2c4e02a2fbc89e21f2816b38 Mon Sep 17 00:00:00 2001 From: Phil Sphicas Date: Sun, 19 Apr 2020 01:35:09 -0700 Subject: [PATCH] Reestablish watch and retry wait for some errors Armada uses a Kubernetes watch to implement its chart wait logic. This can be a fairly long-lived connection to the Kubernetes API server, and is vulnerable to disruption (if, for example, the kubernetes apiserver chart is being upgraded). This change allows Armada to retry the wait for some specific errors, including the establishment of a new watch, until the overall chart timeout is reached. https://github.com/kubernetes-client/python/issues/972 urllib3.exceptions.ProtocolError: ('Connection broken: IncompleteRead(0 bytes read)', IncompleteRead(0 bytes read)) Change-Id: I3e68a54becadd5b2a2343960a120bdc3de8e8515 --- armada/handlers/wait.py | 12 ++++++++++++ requirements.txt | 1 + 2 files changed, 13 insertions(+) diff --git a/armada/handlers/wait.py b/armada/handlers/wait.py index 57790259..e2c156f7 100644 --- a/armada/handlers/wait.py +++ b/armada/handlers/wait.py @@ -21,6 +21,8 @@ import time from kubernetes import watch from oslo_log import log as logging +from retry import retry +import urllib3.exceptions from armada import const from armada.exceptions import k8s_exceptions @@ -318,6 +320,16 @@ class ResourceWait(ABC): else: self._wait(deadline) + # The Kubernetes Python Client does not always recover from broken + # connections to the k8s apiserver, and the resulting uncaught exceptions + # in the Watch.stream method cause the chart installation to fail. As long + # as the wait deadline has not passed, it is better to retry the entire + # wait operation. + @retry( + exceptions=( + urllib3.exceptions.ProtocolError, + urllib3.exceptions.MaxRetryError), + delay=1) def _wait(self, deadline): ''' Waits for resources to become ready. diff --git a/requirements.txt b/requirements.txt index f1416ea2..07b636a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ PasteDeploy>=1.5.2 protobuf>=3.4.0 PyYAML==3.12 requests +retry prometheus_client==0.7.0 # API