[chart_deploy] Handle unknown chart status in a more resilient manner

In cases when a chart is in a pending state and the last deployment age
for the chart is less than the chart wait timeout, armada can consume
all the retries without attempting to purge and re-install the
chart.

This PS addresses the above issue by waiting for the chart until
the wait timeout is met and then proceeding ahead with usual
armada process of purging and re-installing the chart if the status
is not DEPLOYED.

Other code changes in this PS are just re-arranging the existing logic to
fix some edge cases and solve pep8 issues.

Change-Id: Id11d0fb06201264015b3064a43e83903f99a30b1
This commit is contained in:
Nishant Kumar 2020-07-13 18:22:55 +00:00 committed by Nishant Kumar
parent d31aefb76c
commit 88c200fc18
1 changed files with 78 additions and 62 deletions

View File

@ -185,68 +185,6 @@ class ChartDeploy(object):
deploy = upgrade
else:
# Check for release with status other than DEPLOYED
if status:
if status != const.STATUS_FAILED:
LOG.warn(
'Unexpected release status encountered '
'release=%s, status=%s', release_name, status)
# Make best effort to determine whether a deployment is
# likely pending, by checking if the last deployment
# was started within the timeout window of the chart.
last_deployment_age = r.get_last_deployment_age(
old_release)
likely_pending = last_deployment_age <= wait_timeout
if likely_pending:
# Give up if a deployment is likely pending, we do not
# want to have multiple operations going on for the
# same release at the same time.
raise armada_exceptions.\
DeploymentLikelyPendingException(
release_name, status, last_deployment_age,
wait_timeout)
else:
# Release is likely stuck in an unintended (by tiller)
# state. Log and continue on with remediation steps
# below.
LOG.info(
'Old release %s likely stuck in status %s, '
'(last deployment age=%ss) >= '
'(chart wait timeout=%ss)', release, status,
last_deployment_age, wait_timeout)
protected = chart.get('protected', {})
if protected:
p_continue = protected.get('continue_processing', False)
if p_continue:
LOG.warn(
'Release %s is `protected`, '
'continue_processing=True. Operator must '
'handle %s release manually.', release_name,
status)
result['protected'] = release_name
return result
else:
LOG.error(
'Release %s is `protected`, '
'continue_processing=False.', release_name)
raise armada_exceptions.ProtectedReleaseException(
release_name, status)
else:
# Purge the release
with metrics.CHART_DELETE.get_context(manifest_name,
chart_name):
LOG.info(
'Purging release %s with status %s', release_name,
status)
chart_delete = ChartDelete(
chart, release_name, self.tiller)
chart_delete.delete()
result['purge'] = release_name
action = metrics.ChartDeployAction.INSTALL
def install():
timer = int(round(deadline - time.time()))
@ -267,6 +205,55 @@ class ChartDeploy(object):
tiller_result.__dict__)
result['install'] = release_name
# Check for release with status other than DEPLOYED
if status:
if status != const.STATUS_FAILED:
LOG.warn(
'Unexpected release status encountered '
'release=%s, status=%s', release_name, status)
# Make best effort to determine whether a deployment is
# likely pending, by checking if the last deployment
# was started within the timeout window of the chart.
last_deployment_age = r.get_last_deployment_age(
old_release)
likely_pending = last_deployment_age <= wait_timeout
if likely_pending:
# We don't take any deploy action and wait for the
# chart to get deployed.
deploy = noop
deadline = deadline - last_deployment_age
else:
# Release is likely stuck in an unintended (by tiller)
# state. Log and continue on with remediation steps
# below.
LOG.info(
'Old release %s likely stuck in status %s, '
'(last deployment age=%ss) >= '
'(chart wait timeout=%ss)', release, status,
last_deployment_age, wait_timeout)
res = self.purge_release(
chart, release_name, status, manifest_name,
chart_name, result)
if isinstance(res, dict):
if 'protected' in res:
return res
action = metrics.ChartDeployAction.INSTALL
deploy = install
else:
# The chart is in Failed state, hence we purge
# the chart and attempt to install it again.
res = self.purge_release(
chart, release_name, status, manifest_name, chart_name,
result)
if isinstance(res, dict):
if 'protected' in res:
return res
action = metrics.ChartDeployAction.INSTALL
deploy = install
if status is None:
action = metrics.ChartDeployAction.INSTALL
deploy = install
# Deploy
@ -298,6 +285,35 @@ class ChartDeploy(object):
return result
def purge_release(
        self, chart, release_name, status, manifest_name, chart_name,
        result):
    """Purge a release unless the chart marks it as protected.

    :param chart: Chart document; its optional ``protected`` mapping is
        consulted before anything is deleted.
    :param release_name: Name of the release to purge.
    :param status: Current status of the release; used in log messages
        and passed to the raised exception.
    :param manifest_name: Manifest name, forwarded to the chart-delete
        metrics context.
    :param chart_name: Chart name, forwarded to the chart-delete
        metrics context.
    :param result: Dict that is mutated in place to record the outcome
        under the ``protected`` or ``purge`` key.
    :returns: ``result`` when the release is protected and
        ``continue_processing`` is truthy; otherwise ``None`` after the
        release has been purged. Callers distinguish the two cases by
        checking for a dict containing ``protected``.
    :raises armada_exceptions.ProtectedReleaseException: when the
        release is protected and ``continue_processing`` is falsy.
    """
    protected = chart.get('protected', {})
    if protected:
        if protected.get('continue_processing', False):
            # Leave the release untouched and let the caller move on.
            LOG.warning(
                'Release %s is `protected`, '
                'continue_processing=True. Operator must '
                'handle %s release manually.', release_name, status)
            result['protected'] = release_name
            return result

        LOG.error(
            'Release %s is `protected`, '
            'continue_processing=False.', release_name)
        raise armada_exceptions.ProtectedReleaseException(
            release_name, status)

    # Not protected: purge the release.
    # NOTE(review): indentation was lost in this view; the purge
    # bookkeeping is assumed to sit inside the metrics context.
    with metrics.CHART_DELETE.get_context(manifest_name, chart_name):
        LOG.info(
            'Purging release %s with status %s', release_name, status)
        chart_delete = ChartDelete(chart, release_name, self.tiller)
        chart_delete.delete()
        result['purge'] = release_name
    return None
def _test_chart(self, release_name, test_handler):
success = test_handler.test_release_for_success()
if not success: