diff --git a/armada/handlers/wait.py b/armada/handlers/wait.py index 57790259..e2c156f7 100644 --- a/armada/handlers/wait.py +++ b/armada/handlers/wait.py @@ -21,6 +21,8 @@ import time from kubernetes import watch from oslo_log import log as logging +from retry import retry +import urllib3.exceptions from armada import const from armada.exceptions import k8s_exceptions @@ -318,6 +320,16 @@ class ResourceWait(ABC): else: self._wait(deadline) + # The Kubernetes Python Client does not always recover from broken + # connections to the k8s apiserver, and the resulting uncaught exceptions + # in the Watch.stream method cause the chart installation to fail. As long + # as the wait deadline has not passed, it is better to retry the entire + # wait operation. + @retry( + exceptions=( + urllib3.exceptions.ProtocolError, + urllib3.exceptions.MaxRetryError), + delay=1) def _wait(self, deadline): ''' Waits for resources to become ready. diff --git a/requirements.txt b/requirements.txt index f1416ea2..07b636a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ PasteDeploy>=1.5.2 protobuf>=3.4.0 PyYAML==3.12 requests +retry prometheus_client==0.7.0 # API