summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sean Eagan <sean.eagan@att.com>, 2019-01-16 14:10:10 -0600
committer: Drew Walters <drewwalters96@gmail.com>, 2019-01-18 23:06:01 +0000
commit: 2310ddbc2cd7b676ea171ecb88b317f44bc1d05b (patch)
tree: a53c559cd4d507e788618ed0386359d307c83c4b
parent: b0d9506906d0fae2a6f67ab0ff950060550b094a (diff)
Remediate releases stuck in non-DEPLOYED statuses
Armada remediates releases stuck in FAILED status, if not protected, by purging and re-installing them. This implements the same for other non-DEPLOYED statuses.

For these statuses it guards this with a best-effort determination of whether a previous deployment of the release, either through armada or the helm CLI, is likely still pending, based on whether it was last deployed within the chart's wait timeout. If it is deemed likely pending, an error is raised; however, this condition will eventually expire on future runs, allowing for eventual remediation.

Reasons why a release may get stuck in statuses other than DEPLOYED or FAILED include:

1. tiller crashed mid-deployment
2. tiller could not reach kubernetes to update the release state
3. running `helm delete <rel>` (without --purge) (DELETED status)

Change-Id: Ia89cd59f056103dde47980a149c07a2984c4bbb4
Notes
Notes (review):
Code-Review+2: Scott Hussey <sthussey@att.com>
Code-Review+1: Drew Walters <drewwalters96@gmail.com>
Code-Review+2: Bryan Strassner <strassner.bryan@gmail.com>
Workflow+1: Bryan Strassner <strassner.bryan@gmail.com>
Verified+2: Zuul
Submitted-by: Zuul
Submitted-at: Mon, 28 Jan 2019 18:15:50 +0000
Reviewed-on: https://review.openstack.org/631328
Project: openstack/airship-armada
Branch: refs/heads/master
-rw-r--r-- armada/exceptions/armada_exceptions.py 28
-rw-r--r-- armada/handlers/chart_deploy.py 86
-rw-r--r-- armada/utils/release.py 16
-rw-r--r-- doc/source/operations/exceptions/armada-exceptions.inc 2
4 files changed, 89 insertions, 43 deletions
diff --git a/armada/exceptions/armada_exceptions.py b/armada/exceptions/armada_exceptions.py
index 052a66e..4438e9c 100644
--- a/armada/exceptions/armada_exceptions.py
+++ b/armada/exceptions/armada_exceptions.py
@@ -31,14 +31,15 @@ class ArmadaTimeoutException(ArmadaException):
31 31
32class ProtectedReleaseException(ArmadaException): 32class ProtectedReleaseException(ArmadaException):
33 ''' 33 '''
34 Exception that occurs when Armada encounters a FAILED release that is 34 Exception that occurs when Armada encounters a release with status other
35 designated `protected` in the Chart and `continue_processing` is False. 35 than DEPLOYED that is designated `protected` in the Chart and
36 `continue_processing` is False.
36 ''' 37 '''
37 38
38 def __init__(self, reason): 39 def __init__(self, release, status):
39 self._message = ( 40 self._message = (
40 'Armada encountered protected release %s in FAILED status' % 41 'Armada encountered protected release {} in {} status'.format(
41 reason) 42 release, status))
42 super(ProtectedReleaseException, self).__init__(self._message) 43 super(ProtectedReleaseException, self).__init__(self._message)
43 44
44 45
@@ -88,13 +89,16 @@ class WaitException(ArmadaException):
88 super(WaitException, self).__init__(message) 89 super(WaitException, self).__init__(message)
89 90
90 91
91class UnexpectedReleaseStatusException(ArmadaException): 92class DeploymentLikelyPendingException(ArmadaException):
92 ''' 93 '''
93 Exception that occurs when armada encounters an existing release for a 94 Exception that occurs when it is detected that an existing release
94 chart with an unexpected status which armada does not know what to do with. 95 operation (e.g. install, update, rollback, delete) is likely still pending.
95 ''' 96 '''
96 97
97 def __init__(self, release_name, status): 98 def __init__(self, release, status, last_deployment_age, timeout):
98 self._message = "Found release {} in unexpected status {}".format( 99 self._message = (
99 release_name, status) 100 'Existing deployment likely pending '
100 super(UnexpectedReleaseStatusException, self).__init__(self._message) 101 'release={}, status={}, '
102 '(last deployment age={}s) < (chart wait timeout={}s)'.format(
103 release, status, last_deployment_age, timeout))
104 super(DeploymentLikelyPendingException, self).__init__(self._message)
diff --git a/armada/handlers/chart_deploy.py b/armada/handlers/chart_deploy.py
index 203a145..76a3b12 100644
--- a/armada/handlers/chart_deploy.py
+++ b/armada/handlers/chart_deploy.py
@@ -52,19 +52,12 @@ class ChartDeploy(object):
52 52
53 result = {} 53 result = {}
54 54
55 protected = chart.get('protected', {})
56 p_continue = protected.get('continue_processing', False)
57
58 old_release = self.find_chart_release(known_releases, release_name) 55 old_release = self.find_chart_release(known_releases, release_name)
59 56
60 status = None 57 status = None
61 if old_release: 58 if old_release:
62 status = r.get_release_status(old_release) 59 status = r.get_release_status(old_release)
63 60
64 if status not in [const.STATUS_FAILED, const.STATUS_DEPLOYED]:
65 raise armada_exceptions.UnexpectedReleaseStatusException(
66 release_name, status)
67
68 chart_wait = ChartWait( 61 chart_wait = ChartWait(
69 self.tiller.k8s, 62 self.tiller.k8s,
70 release_name, 63 release_name,
@@ -82,29 +75,6 @@ class ChartDeploy(object):
82 chartbuilder = ChartBuilder(chart) 75 chartbuilder = ChartBuilder(chart)
83 new_chart = chartbuilder.get_helm_chart() 76 new_chart = chartbuilder.get_helm_chart()
84 77
85 # Check for existing FAILED release, and purge
86 if status == const.STATUS_FAILED:
87 LOG.info('Purging FAILED release %s before deployment.',
88 release_name)
89 if protected:
90 if p_continue:
91 LOG.warn(
92 'Release %s is `protected`, '
93 'continue_processing=True. Operator must '
94 'handle FAILED release manually.', release_name)
95 result['protected'] = release_name
96 return result
97 else:
98 LOG.error(
99 'Release %s is `protected`, '
100 'continue_processing=False.', release_name)
101 raise armada_exceptions.ProtectedReleaseException(
102 release_name)
103 else:
104 # Purge the release
105 self.tiller.uninstall_release(release_name)
106 result['purge'] = release_name
107
108 # TODO(mark-burnett): It may be more robust to directly call 78 # TODO(mark-burnett): It may be more robust to directly call
109 # tiller status to decide whether to install/upgrade rather 79 # tiller status to decide whether to install/upgrade rather
110 # than checking for list membership. 80 # than checking for list membership.
@@ -181,6 +151,62 @@ class ChartDeploy(object):
181 tiller_result.__dict__) 151 tiller_result.__dict__)
182 result['upgrade'] = release_name 152 result['upgrade'] = release_name
183 else: 153 else:
154 # Check for release with status other than DEPLOYED
155 if status:
156 if status != const.STATUS_FAILED:
157 LOG.warn(
158 'Unexpected release status encountered '
159 'release=%s, status=%s', release_name, status)
160
161 # Make best effort to determine whether a deployment is
162 # likely pending, by checking if the last deployment
163 # was started within the timeout window of the chart.
164 last_deployment_age = r.get_last_deployment_age(
165 old_release)
166 wait_timeout = chart_wait.get_timeout()
167 likely_pending = last_deployment_age <= wait_timeout
168 if likely_pending:
169 # Give up if a deployment is likely pending, we do not
170 # want to have multiple operations going on for the
171 # same release at the same time.
172 raise armada_exceptions.\
173 DeploymentLikelyPendingException(
174 release_name, status, last_deployment_age,
175 wait_timeout)
176 else:
177 # Release is likely stuck in an unintended (by tiller)
178 # state. Log and continue on with remediation steps
179 # below.
180 LOG.info(
181 'Old release %s likely stuck in status %s, '
182 '(last deployment age=%ss) >= '
183 '(chart wait timeout=%ss)', release, status,
184 last_deployment_age, wait_timeout)
185
186 protected = chart.get('protected', {})
187 if protected:
188 p_continue = protected.get('continue_processing', False)
189 if p_continue:
190 LOG.warn(
191 'Release %s is `protected`, '
192 'continue_processing=True. Operator must '
193 'handle %s release manually.', release_name,
194 status)
195 result['protected'] = release_name
196 return result
197 else:
198 LOG.error(
199 'Release %s is `protected`, '
200 'continue_processing=False.', release_name)
201 raise armada_exceptions.ProtectedReleaseException(
202 release_name, status)
203 else:
204 # Purge the release
205 LOG.info('Purging release %s with status %s', release_name,
206 status)
207 self.tiller.uninstall_release(release_name)
208 result['purge'] = release_name
209
184 timer = int(round(deadline - time.time())) 210 timer = int(round(deadline - time.time()))
185 LOG.info( 211 LOG.info(
186 "Installing release %s in namespace %s, wait=%s, " 212 "Installing release %s in namespace %s, wait=%s, "
diff --git a/armada/utils/release.py b/armada/utils/release.py
index dbd51d9..55fdda7 100644
--- a/armada/utils/release.py
+++ b/armada/utils/release.py
@@ -14,6 +14,8 @@
14 14
15from armada.handlers.test import Test 15from armada.handlers.test import Test
16 16
17import time
18
17 19
18def release_prefixer(prefix, release): 20def release_prefixer(prefix, release):
19 ''' 21 '''
@@ -53,3 +55,17 @@ def get_last_test_result(release):
53 if not status.HasField('last_test_suite_run'): 55 if not status.HasField('last_test_suite_run'):
54 return None 56 return None
55 return Test.get_test_suite_run_success(status.last_test_suite_run) 57 return Test.get_test_suite_run_success(status.last_test_suite_run)
58
59
60def get_last_deployment_age(release):
61 """
62 :param release: protobuf release object
63
64 :return: age in seconds of last deployment of release
65 """
66
67 last_deployed = release.info.last_deployed.seconds
68 now = int(time.time())
69 last_deployment_age = now - last_deployed
70
71 return last_deployment_age
diff --git a/doc/source/operations/exceptions/armada-exceptions.inc b/doc/source/operations/exceptions/armada-exceptions.inc
index a86679a..87ca498 100644
--- a/doc/source/operations/exceptions/armada-exceptions.inc
+++ b/doc/source/operations/exceptions/armada-exceptions.inc
@@ -47,7 +47,7 @@ Armada Exceptions
47 :show-inheritance: 47 :show-inheritance:
48 :undoc-members: 48 :undoc-members:
49 49
50.. autoexception:: armada.exceptions.armada_exceptions.UnexpectedReleaseStatusException 50.. autoexception:: armada.exceptions.armada_exceptions.DeploymentLikelyPendingException
51 :members: 51 :members:
52 :show-inheritance: 52 :show-inheritance:
53 :undoc-members: 53 :undoc-members: