From 2310ddbc2cd7b676ea171ecb88b317f44bc1d05b Mon Sep 17 00:00:00 2001 From: Sean Eagan Date: Wed, 16 Jan 2019 14:10:10 -0600 Subject: [PATCH] Remediate releases stuck in non-DEPLOYED statuses Armada remediates releases stuck in FAILED status, if not protected, by purging and re-installing them. This implements the same for other non-DEPLOYED statuses. For these statuses it guards this with a best effort determination of whether a previous deployment of the release, either through armada or the helm CLI, is likely still pending based on whether it was last deployed within the chart's wait timeout. If it is deemed likely pending, an error is raised; however, this condition will eventually expire on future runs, allowing for eventual remediation. Reasons why a release may get stuck in statuses other than DEPLOYED or FAILED include: 1. tiller crashed mid-deployment 2. tiller could not reach kubernetes to update the release state 3. running `helm delete <release>` (without --purge) (DELETED status) Change-Id: Ia89cd59f056103dde47980a149c07a2984c4bbb4 --- armada/exceptions/armada_exceptions.py | 28 +++--- armada/handlers/chart_deploy.py | 86 ++++++++++++------- armada/utils/release.py | 16 ++++ .../exceptions/armada-exceptions.inc | 2 +- 4 files changed, 89 insertions(+), 43 deletions(-) diff --git a/armada/exceptions/armada_exceptions.py b/armada/exceptions/armada_exceptions.py index 052a66e6..4438e9c0 100644 --- a/armada/exceptions/armada_exceptions.py +++ b/armada/exceptions/armada_exceptions.py @@ -31,14 +31,15 @@ class ArmadaTimeoutException(ArmadaException): class ProtectedReleaseException(ArmadaException): ''' - Exception that occurs when Armada encounters a FAILED release that is - designated `protected` in the Chart and `continue_processing` is False. + Exception that occurs when Armada encounters a release with status other + than DEPLOYED that is designated `protected` in the Chart and + `continue_processing` is False. 
''' - def __init__(self, reason): + def __init__(self, release, status): self._message = ( - 'Armada encountered protected release %s in FAILED status' % - reason) + 'Armada encountered protected release {} in {} status'.format( + release, status)) super(ProtectedReleaseException, self).__init__(self._message) @@ -88,13 +89,16 @@ class WaitException(ArmadaException): super(WaitException, self).__init__(message) -class UnexpectedReleaseStatusException(ArmadaException): +class DeploymentLikelyPendingException(ArmadaException): ''' - Exception that occurs when armada encounters an existing release for a - chart with an unexpected status which armada does not know what to do with. + Exception that occurs when it is detected that an existing release + operation (e.g. install, update, rollback, delete) is likely still pending. ''' - def __init__(self, release_name, status): - self._message = "Found release {} in unexpected status {}".format( - release_name, status) - super(UnexpectedReleaseStatusException, self).__init__(self._message) + def __init__(self, release, status, last_deployment_age, timeout): + self._message = ( + 'Existing deployment likely pending ' + 'release={}, status={}, ' + '(last deployment age={}s) < (chart wait timeout={}s)'.format( + release, status, last_deployment_age, timeout)) + super(DeploymentLikelyPendingException, self).__init__(self._message) diff --git a/armada/handlers/chart_deploy.py b/armada/handlers/chart_deploy.py index 203a1459..76a3b128 100644 --- a/armada/handlers/chart_deploy.py +++ b/armada/handlers/chart_deploy.py @@ -52,19 +52,12 @@ class ChartDeploy(object): result = {} - protected = chart.get('protected', {}) - p_continue = protected.get('continue_processing', False) - old_release = self.find_chart_release(known_releases, release_name) status = None if old_release: status = r.get_release_status(old_release) - if status not in [const.STATUS_FAILED, const.STATUS_DEPLOYED]: - raise 
armada_exceptions.UnexpectedReleaseStatusException( - release_name, status) - chart_wait = ChartWait( self.tiller.k8s, release_name, @@ -82,29 +75,6 @@ class ChartDeploy(object): chartbuilder = ChartBuilder(chart) new_chart = chartbuilder.get_helm_chart() - # Check for existing FAILED release, and purge - if status == const.STATUS_FAILED: - LOG.info('Purging FAILED release %s before deployment.', - release_name) - if protected: - if p_continue: - LOG.warn( - 'Release %s is `protected`, ' - 'continue_processing=True. Operator must ' - 'handle FAILED release manually.', release_name) - result['protected'] = release_name - return result - else: - LOG.error( - 'Release %s is `protected`, ' - 'continue_processing=False.', release_name) - raise armada_exceptions.ProtectedReleaseException( - release_name) - else: - # Purge the release - self.tiller.uninstall_release(release_name) - result['purge'] = release_name - # TODO(mark-burnett): It may be more robust to directly call # tiller status to decide whether to install/upgrade rather # than checking for list membership. @@ -181,6 +151,62 @@ class ChartDeploy(object): tiller_result.__dict__) result['upgrade'] = release_name else: + # Check for release with status other than DEPLOYED + if status: + if status != const.STATUS_FAILED: + LOG.warn( + 'Unexpected release status encountered ' + 'release=%s, status=%s', release_name, status) + + # Make best effort to determine whether a deployment is + # likely pending, by checking if the last deployment + # was started within the timeout window of the chart. + last_deployment_age = r.get_last_deployment_age( + old_release) + wait_timeout = chart_wait.get_timeout() + likely_pending = last_deployment_age <= wait_timeout + if likely_pending: + # Give up if a deployment is likely pending, we do not + # want to have multiple operations going on for the + # same release at the same time. 
+ raise armada_exceptions.\ + DeploymentLikelyPendingException( + release_name, status, last_deployment_age, + wait_timeout) + else: + # Release is likely stuck in an unintended (by tiller) + # state. Log and continue on with remediation steps + # below. + LOG.info( + 'Old release %s likely stuck in status %s, ' + '(last deployment age=%ss) >= ' + '(chart wait timeout=%ss)', release, status, + last_deployment_age, wait_timeout) + + protected = chart.get('protected', {}) + if protected: + p_continue = protected.get('continue_processing', False) + if p_continue: + LOG.warn( + 'Release %s is `protected`, ' + 'continue_processing=True. Operator must ' + 'handle %s release manually.', release_name, + status) + result['protected'] = release_name + return result + else: + LOG.error( + 'Release %s is `protected`, ' + 'continue_processing=False.', release_name) + raise armada_exceptions.ProtectedReleaseException( + release_name, status) + else: + # Purge the release + LOG.info('Purging release %s with status %s', release_name, + status) + self.tiller.uninstall_release(release_name) + result['purge'] = release_name + timer = int(round(deadline - time.time())) LOG.info( "Installing release %s in namespace %s, wait=%s, " diff --git a/armada/utils/release.py b/armada/utils/release.py index dbd51d9b..55fdda75 100644 --- a/armada/utils/release.py +++ b/armada/utils/release.py @@ -14,6 +14,8 @@ from armada.handlers.test import Test +import time + def release_prefixer(prefix, release): ''' @@ -53,3 +55,17 @@ def get_last_test_result(release): if not status.HasField('last_test_suite_run'): return None return Test.get_test_suite_run_success(status.last_test_suite_run) + + +def get_last_deployment_age(release): + """ + :param release: protobuf release object + + :return: age in seconds of last deployment of release + """ + + last_deployed = release.info.last_deployed.seconds + now = int(time.time()) + last_deployment_age = now - last_deployed + + return last_deployment_age diff --git 
a/doc/source/operations/exceptions/armada-exceptions.inc b/doc/source/operations/exceptions/armada-exceptions.inc index a86679a3..87ca4989 100644 --- a/doc/source/operations/exceptions/armada-exceptions.inc +++ b/doc/source/operations/exceptions/armada-exceptions.inc @@ -47,7 +47,7 @@ Armada Exceptions :show-inheritance: :undoc-members: -.. autoexception:: armada.exceptions.armada_exceptions.UnexpectedReleaseStatusException +.. autoexception:: armada.exceptions.armada_exceptions.DeploymentLikelyPendingException :members: :show-inheritance: :undoc-members: