Remediate releases stuck in non-DEPLOYED statuses

Armada remediates releases stuck in FAILED status, if not protected,
by purging and re-installing them. This implements the same for other
non-DEPLOYED statuses. For these statuses it guards this with a best
effort determination of whether a previous deployment of the release,
either through armada or the helm CLI, is likely still pending based
on whether it was last deployed within the chart's wait timeout. If
it is deemed likely pending, an error is raised; however, this
condition will eventually expire on future runs, allowing for
eventual remediation.

Reasons why a release may get stuck in statuses other than DEPLOYED
or FAILED include:

1. tiller crashed mid-deployment
2. tiller could not reach kubernetes to update the release state
3. running `helm delete <rel>` (without --purge) (DELETED status)

Change-Id: Ia89cd59f056103dde47980a149c07a2984c4bbb4
This commit is contained in:
Sean Eagan 2019-01-16 14:10:10 -06:00 committed by Drew Walters
parent b0d9506906
commit 2310ddbc2c
4 changed files with 89 additions and 43 deletions

View File

@ -31,14 +31,15 @@ class ArmadaTimeoutException(ArmadaException):
class ProtectedReleaseException(ArmadaException):
'''
Exception that occurs when Armada encounters a FAILED release that is
designated `protected` in the Chart and `continue_processing` is False.
Exception that occurs when Armada encounters a release with status other
than DEPLOYED that is designated `protected` in the Chart and
`continue_processing` is False.
'''
def __init__(self, reason):
def __init__(self, release, status):
self._message = (
'Armada encountered protected release %s in FAILED status' %
reason)
'Armada encountered protected release {} in {} status'.format(
release, status))
super(ProtectedReleaseException, self).__init__(self._message)
@ -88,13 +89,16 @@ class WaitException(ArmadaException):
super(WaitException, self).__init__(message)
class UnexpectedReleaseStatusException(ArmadaException):
class DeploymentLikelyPendingException(ArmadaException):
'''
Exception that occurs when armada encounters an existing release for a
chart with an unexpected status which armada does not know what to do with.
Exception that occurs when it is detected that an existing release
operation (e.g. install, update, rollback, delete) is likely still pending.
'''
def __init__(self, release_name, status):
self._message = "Found release {} in unexpected status {}".format(
release_name, status)
super(UnexpectedReleaseStatusException, self).__init__(self._message)
def __init__(self, release, status, last_deployment_age, timeout):
self._message = (
'Existing deployment likely pending '
'release={}, status={}, '
'(last deployment age={}s) < (chart wait timeout={}s)'.format(
release, status, last_deployment_age, timeout))
super(DeploymentLikelyPendingException, self).__init__(self._message)

View File

@ -52,19 +52,12 @@ class ChartDeploy(object):
result = {}
protected = chart.get('protected', {})
p_continue = protected.get('continue_processing', False)
old_release = self.find_chart_release(known_releases, release_name)
status = None
if old_release:
status = r.get_release_status(old_release)
if status not in [const.STATUS_FAILED, const.STATUS_DEPLOYED]:
raise armada_exceptions.UnexpectedReleaseStatusException(
release_name, status)
chart_wait = ChartWait(
self.tiller.k8s,
release_name,
@ -82,29 +75,6 @@ class ChartDeploy(object):
chartbuilder = ChartBuilder(chart)
new_chart = chartbuilder.get_helm_chart()
# Check for existing FAILED release, and purge
if status == const.STATUS_FAILED:
LOG.info('Purging FAILED release %s before deployment.',
release_name)
if protected:
if p_continue:
LOG.warn(
'Release %s is `protected`, '
'continue_processing=True. Operator must '
'handle FAILED release manually.', release_name)
result['protected'] = release_name
return result
else:
LOG.error(
'Release %s is `protected`, '
'continue_processing=False.', release_name)
raise armada_exceptions.ProtectedReleaseException(
release_name)
else:
# Purge the release
self.tiller.uninstall_release(release_name)
result['purge'] = release_name
# TODO(mark-burnett): It may be more robust to directly call
# tiller status to decide whether to install/upgrade rather
# than checking for list membership.
@ -181,6 +151,62 @@ class ChartDeploy(object):
tiller_result.__dict__)
result['upgrade'] = release_name
else:
# Check for release with status other than DEPLOYED
if status:
if status != const.STATUS_FAILED:
LOG.warn(
'Unexpected release status encountered '
'release=%s, status=%s', release_name, status)
# Make best effort to determine whether a deployment is
# likely pending, by checking if the last deployment
# was started within the timeout window of the chart.
last_deployment_age = r.get_last_deployment_age(
old_release)
wait_timeout = chart_wait.get_timeout()
likely_pending = last_deployment_age <= wait_timeout
if likely_pending:
# Give up if a deployment is likely pending, we do not
# want to have multiple operations going on for the
# same release at the same time.
raise armada_exceptions.\
DeploymentLikelyPendingException(
release_name, status, last_deployment_age,
wait_timeout)
else:
# Release is likely stuck in an unintended (by tiller)
# state. Log and continue on with remediation steps
# below.
LOG.info(
'Old release %s likely stuck in status %s, '
'(last deployment age=%ss) >= '
'(chart wait timeout=%ss)', release, status,
last_deployment_age, wait_timeout)
protected = chart.get('protected', {})
if protected:
p_continue = protected.get('continue_processing', False)
if p_continue:
LOG.warn(
'Release %s is `protected`, '
'continue_processing=True. Operator must '
'handle %s release manually.', release_name,
status)
result['protected'] = release_name
return result
else:
LOG.error(
'Release %s is `protected`, '
'continue_processing=False.', release_name)
raise armada_exceptions.ProtectedReleaseException(
release_name, status)
else:
# Purge the release
LOG.info('Purging release %s with status %s', release_name,
status)
self.tiller.uninstall_release(release_name)
result['purge'] = release_name
timer = int(round(deadline - time.time()))
LOG.info(
"Installing release %s in namespace %s, wait=%s, "

View File

@ -14,6 +14,8 @@
from armada.handlers.test import Test
import time
def release_prefixer(prefix, release):
'''
@ -53,3 +55,17 @@ def get_last_test_result(release):
if not status.HasField('last_test_suite_run'):
return None
return Test.get_test_suite_run_success(status.last_test_suite_run)
def get_last_deployment_age(release):
    """
    Compute how long ago the given release was last deployed.

    :param release: protobuf release object; only
        ``release.info.last_deployed.seconds`` is read (presumably a Unix
        epoch timestamp, since it is subtracted from ``time.time()`` --
        confirm against the release protobuf definition)
    :return: age in seconds of last deployment of release
    """
    last_deployed = release.info.last_deployed.seconds
    # Truncate the wall-clock time to whole seconds so the subtraction
    # works on the same granularity as the stored ``seconds`` value.
    now = int(time.time())
    return now - last_deployed

View File

@ -47,7 +47,7 @@ Armada Exceptions
:show-inheritance:
:undoc-members:
.. autoexception:: armada.exceptions.armada_exceptions.UnexpectedReleaseStatusException
.. autoexception:: armada.exceptions.armada_exceptions.DeploymentLikelyPendingException
:members:
:show-inheritance:
:undoc-members: