From 88c200fc18fab77d1b935249803ab05a628e2d64 Mon Sep 17 00:00:00 2001
From: Nishant Kumar
Date: Mon, 13 Jul 2020 18:22:55 +0000
Subject: [PATCH] [chart_deploy] Handle unknown chart status in a more resilient manner

In cases where a chart is in a pending state and the last deployment age
for the chart is less than the chart wait timeout, armada can consume all
of the retries without ever attempting to purge and re-install the chart.

This PS addresses the above issue by waiting for the chart until the wait
timeout is met and then proceeding with the usual armada process of
purging and re-installing the chart if the status is still not DEPLOYED.

The other code changes in this PS just re-arrange the existing logic to
fix some edge cases and resolve pep8 issues.

Change-Id: Id11d0fb06201264015b3064a43e83903f99a30b1
---
 armada/handlers/chart_deploy.py | 140 ++++++++++++++++++--------------
 1 file changed, 78 insertions(+), 62 deletions(-)

diff --git a/armada/handlers/chart_deploy.py b/armada/handlers/chart_deploy.py
index 8821bfce..601c709c 100644
--- a/armada/handlers/chart_deploy.py
+++ b/armada/handlers/chart_deploy.py
@@ -185,68 +185,6 @@ class ChartDeploy(object):
 
             deploy = upgrade
         else:
-            # Check for release with status other than DEPLOYED
-            if status:
-                if status != const.STATUS_FAILED:
-                    LOG.warn(
-                        'Unexpected release status encountered '
-                        'release=%s, status=%s', release_name, status)
-
-                    # Make best effort to determine whether a deployment is
-                    # likely pending, by checking if the last deployment
-                    # was started within the timeout window of the chart.
-                    last_deployment_age = r.get_last_deployment_age(
-                        old_release)
-                    likely_pending = last_deployment_age <= wait_timeout
-                    if likely_pending:
-                        # Give up if a deployment is likely pending, we do not
-                        # want to have multiple operations going on for the
-                        # same release at the same time.
-                        raise armada_exceptions.\
-                            DeploymentLikelyPendingException(
-                                release_name, status, last_deployment_age,
-                                wait_timeout)
-                    else:
-                        # Release is likely stuck in an unintended (by tiller)
-                        # state. Log and continue on with remediation steps
-                        # below.
-                        LOG.info(
-                            'Old release %s likely stuck in status %s, '
-                            '(last deployment age=%ss) >= '
-                            '(chart wait timeout=%ss)', release, status,
-                            last_deployment_age, wait_timeout)
-
-                protected = chart.get('protected', {})
-                if protected:
-                    p_continue = protected.get('continue_processing', False)
-                    if p_continue:
-                        LOG.warn(
-                            'Release %s is `protected`, '
-                            'continue_processing=True. Operator must '
-                            'handle %s release manually.', release_name,
-                            status)
-                        result['protected'] = release_name
-                        return result
-                    else:
-                        LOG.error(
-                            'Release %s is `protected`, '
-                            'continue_processing=False.', release_name)
-                        raise armada_exceptions.ProtectedReleaseException(
-                            release_name, status)
-                else:
-                    # Purge the release
-                    with metrics.CHART_DELETE.get_context(manifest_name,
-                                                          chart_name):
-
-                        LOG.info(
-                            'Purging release %s with status %s', release_name,
-                            status)
-                        chart_delete = ChartDelete(
-                            chart, release_name, self.tiller)
-                        chart_delete.delete()
-                        result['purge'] = release_name
-
-            action = metrics.ChartDeployAction.INSTALL
 
             def install():
                 timer = int(round(deadline - time.time()))
@@ -267,6 +205,55 @@ class ChartDeploy(object):
                           tiller_result.__dict__)
                 result['install'] = release_name
 
+            # Check for release with status other than DEPLOYED
+            if status:
+                if status != const.STATUS_FAILED:
+                    LOG.warn(
+                        'Unexpected release status encountered '
+                        'release=%s, status=%s', release_name, status)
+
+                    # Make best effort to determine whether a deployment is
+                    # likely pending, by checking if the last deployment
+                    # was started within the timeout window of the chart.
+                    last_deployment_age = r.get_last_deployment_age(
+                        old_release)
+                    likely_pending = last_deployment_age <= wait_timeout
+                    if likely_pending:
+                        # We don't take any deploy action and wait for the
+                        # release to get deployed.
+                        deploy = noop
+                        deadline = deadline - last_deployment_age
+                    else:
+                        # Release is likely stuck in an unintended (by tiller)
+                        # state. Log and continue on with remediation steps
+                        # below.
+                        LOG.info(
+                            'Old release %s likely stuck in status %s, '
+                            '(last deployment age=%ss) >= '
+                            '(chart wait timeout=%ss)', release, status,
+                            last_deployment_age, wait_timeout)
+                        res = self.purge_release(
+                            chart, release_name, status, manifest_name,
+                            chart_name, result)
+                        if isinstance(res, dict):
+                            if 'protected' in res:
+                                return res
+                        action = metrics.ChartDeployAction.INSTALL
+                        deploy = install
+                else:
+                    # The chart is in Failed state, hence we purge
+                    # the chart and attempt to install it again.
+                    res = self.purge_release(
+                        chart, release_name, status, manifest_name, chart_name,
+                        result)
+                    if isinstance(res, dict):
+                        if 'protected' in res:
+                            return res
+                    action = metrics.ChartDeployAction.INSTALL
+                    deploy = install
+
+            if status is None:
+                action = metrics.ChartDeployAction.INSTALL
             deploy = install
 
         # Deploy
@@ -298,6 +285,35 @@ class ChartDeploy(object):
 
         return result
 
+    def purge_release(
+            self, chart, release_name, status, manifest_name, chart_name,
+            result):
+        protected = chart.get('protected', {})
+        if protected:
+            p_continue = protected.get('continue_processing', False)
+            if p_continue:
+                LOG.warn(
+                    'Release %s is `protected`, '
+                    'continue_processing=True. Operator must '
+                    'handle %s release manually.', release_name, status)
+                result['protected'] = release_name
+                return result
+            else:
+                LOG.error(
+                    'Release %s is `protected`, '
+                    'continue_processing=False.', release_name)
+                raise armada_exceptions.ProtectedReleaseException(
+                    release_name, status)
+        else:
+            # Purge the release
+            with metrics.CHART_DELETE.get_context(manifest_name, chart_name):
+
+                LOG.info(
+                    'Purging release %s with status %s', release_name, status)
+                chart_delete = ChartDelete(chart, release_name, self.tiller)
+                chart_delete.delete()
+                result['purge'] = release_name
+
     def _test_chart(self, release_name, test_handler):
         success = test_handler.test_release_for_success()
         if not success:
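
Below is a minimal, standalone sketch of the retry/wait decision the commit
message describes. It is not part of the patch: the function name
decide_action, the simplified status strings, and the returned action tuples
are illustrative assumptions, not armada's actual API.

# Illustrative sketch only; these names are not armada's API.
STATUS_FAILED = 'FAILED'


def decide_action(status, last_deployment_age, wait_timeout):
    """Decide what to do with a release that is not DEPLOYED.

    Mirrors the intent of the patch: wait out a likely-pending deployment
    instead of burning retries, otherwise purge and re-install.
    """
    if status is None:
        # No previous release exists, so this is a plain install.
        return ('install', None)

    likely_pending = (status != STATUS_FAILED
                      and last_deployment_age <= wait_timeout)
    if likely_pending:
        # Wait only for the time still left in the chart's wait window.
        return ('wait', wait_timeout - last_deployment_age)

    # FAILED, or pending for longer than the wait timeout: remediate.
    return ('purge_and_reinstall', None)


if __name__ == '__main__':
    # A release pending for 120s against a 900s wait timeout is waited on
    # for the remaining 780s rather than consuming a retry.
    print(decide_action('PENDING_INSTALL', 120, 900))
    # The same release after 1000s gets purged and re-installed.
    print(decide_action('PENDING_INSTALL', 1000, 900))

The patch itself expresses the "wait" branch differently: it assigns
deploy = noop and subtracts last_deployment_age from the deadline, so the
normal wait step that follows only covers the time remaining in the chart's
wait window; the sketch collapses that into a single remaining-wait value.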