From e0b04e829bb3cd782b30145444e3d6f6a57ec797 Mon Sep 17 00:00:00 2001
From: Mark Burnett
Date: Thu, 15 Feb 2018 11:54:03 -0600
Subject: [PATCH] Fix: Issue where Armada hangs waiting for pods

This also provides better logging while waiting for pods.

Closes #194

Change-Id: I3704ff004c35c8ecf90555d16e42f15d24284492
---
 armada/api/controller/armada.py                    |   1 +
 armada/cli/__init__.py                             |  15 +-
 armada/cli/apply.py                                |   2 +-
 armada/cli/delete.py                               |   4 +-
 armada/cli/test.py                                 |   2 +-
 armada/cli/tiller.py                               |   2 +-
 armada/cli/validate.py                             |   2 +-
 armada/exceptions/k8s_exceptions.py                |  33 ++++
 armada/handlers/k8s.py                             | 144 +++++++++++-------
 .../exceptions/guide-exceptions.rst                |   1 +
 .../operations/exceptions/k8s-exceptions.rst       |  33 ++++
 11 files changed, 180 insertions(+), 59 deletions(-)
 create mode 100644 armada/exceptions/k8s_exceptions.py
 create mode 100644 docs/source/operations/exceptions/k8s-exceptions.rst

diff --git a/armada/api/controller/armada.py b/armada/api/controller/armada.py
index 6ebfcdd5..b3c5c227 100644
--- a/armada/api/controller/armada.py
+++ b/armada/api/controller/armada.py
@@ -102,6 +102,7 @@ class Apply(api.BaseResource):
         except exceptions.ManifestException as e:
             self.return_error(resp, falcon.HTTP_400, message=str(e))
         except Exception as e:
+            self.logger.exception('Caught unexpected exception')
             err_message = 'Failed to apply manifest: {}'.format(e)
             self.error(req.context, err_message)
             self.return_error(
diff --git a/armada/cli/__init__.py b/armada/cli/__init__.py
index 3b8cde86..0256c8d3 100644
--- a/armada/cli/__init__.py
+++ b/armada/cli/__init__.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
+
 from oslo_config import cfg
 from oslo_log import log as logging
 
 from armada import conf
+from armada.exceptions.base_exception import ArmadaBaseException
 
 conf.set_app_default_configs()
 conf.set_default_for_default_log_levels()
@@ -32,5 +35,15 @@ class CliAction(object):
         logging.set_defaults(default_log_levels=CONF.default_log_levels)
         logging.setup(CONF, 'armada')
 
+    def safe_invoke(self):
+        try:
+            self.invoke()
+        except ArmadaBaseException:
+            self.logger.exception('Caught internal exception')
+            sys.exit(1)
+        except Exception:
+            self.logger.exception('Caught unexpected exception')
+            sys.exit(1)
+
     def invoke(self):
-        raise Exception()
+        raise NotImplementedError()
diff --git a/armada/cli/apply.py b/armada/cli/apply.py
index a38804ab..d626222b 100644
--- a/armada/cli/apply.py
+++ b/armada/cli/apply.py
@@ -133,7 +133,7 @@ def apply_create(ctx, locations, api, disable_update_post, disable_update_pre,
     ApplyManifest(ctx, locations, api, disable_update_post, disable_update_pre,
                   dry_run, enable_chart_cleanup, set, tiller_host, tiller_port,
                   tiller_namespace, timeout, values, wait,
-                  target_manifest).invoke()
+                  target_manifest).safe_invoke()
 
 
 class ApplyManifest(CliAction):
diff --git a/armada/cli/delete.py b/armada/cli/delete.py
index be5db330..4878829a 100644
--- a/armada/cli/delete.py
+++ b/armada/cli/delete.py
@@ -73,8 +73,8 @@ SHORT_DESC = "Command deletes releases."
     default=44134)
 @click.pass_context
 def delete_charts(ctx, manifest, releases, no_purge, tiller_host, tiller_port):
-    DeleteChartManifest(
-        ctx, manifest, releases, no_purge, tiller_host, tiller_port).invoke()
+    DeleteChartManifest(ctx, manifest, releases, no_purge, tiller_host,
+                        tiller_port).safe_invoke()
 
 
 class DeleteChartManifest(CliAction):
diff --git a/armada/cli/test.py b/armada/cli/test.py
index e02ebf13..01f2267c 100644
--- a/armada/cli/test.py
+++ b/armada/cli/test.py
@@ -82,7 +82,7 @@ def test_charts(ctx, file, release, tiller_host, tiller_port, tiller_namespace,
                 target_manifest):
     TestChartManifest(
         ctx, file, release, tiller_host, tiller_port, tiller_namespace,
-        target_manifest).invoke()
+        target_manifest).safe_invoke()
 
 
 class TestChartManifest(CliAction):
diff --git a/armada/cli/tiller.py b/armada/cli/tiller.py
index 226b3b0d..307167da 100644
--- a/armada/cli/tiller.py
+++ b/armada/cli/tiller.py
@@ -71,7 +71,7 @@ SHORT_DESC = "Command gets Tiller information."
 def tiller_service(ctx, tiller_host, tiller_port, tiller_namespace, releases,
                    status):
     TillerServices(ctx, tiller_host, tiller_port, tiller_namespace, releases,
-                   status).invoke()
+                   status).safe_invoke()
 
 
 class TillerServices(CliAction):
diff --git a/armada/cli/validate.py b/armada/cli/validate.py
index a43d6ec9..0cbb9246 100644
--- a/armada/cli/validate.py
+++ b/armada/cli/validate.py
@@ -48,7 +48,7 @@ SHORT_DESC = "Command validates Armada Manifest."
     nargs=-1)
 @click.pass_context
 def validate_manifest(ctx, locations):
-    ValidateManifest(ctx, locations).invoke()
+    ValidateManifest(ctx, locations).safe_invoke()
 
 
 class ValidateManifest(CliAction):
diff --git a/armada/exceptions/k8s_exceptions.py b/armada/exceptions/k8s_exceptions.py
new file mode 100644
index 00000000..82ffc6d9
--- /dev/null
+++ b/armada/exceptions/k8s_exceptions.py
@@ -0,0 +1,33 @@
+# Copyright 2018 The Armada Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from armada.exceptions.base_exception import ArmadaBaseException as ex
+
+
+class KubernetesException(ex):
+    '''Base class for Kubernetes exceptions and error handling.'''
+
+    message = 'An unknown Kubernetes error occurred.'
+
+
+class KubernetesUnknownStreamingEventTypeException(KubernetesException):
+    '''Exception for getting an unknown event type from the Kubernetes API.'''
+
+    message = 'An unknown event type was returned from the streaming API.'
+
+
+class KubernetesErrorEventException(KubernetesException):
+    '''Exception for getting an error from the Kubernetes API.'''
+
+    message = 'An error event was returned from the streaming API.'
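
Note: the exceptions above subclass ArmadaBaseException, which is what lets the
new CliAction.safe_invoke() wrapper in armada/cli/__init__.py catch internal
errors separately from unexpected ones. Below is a minimal, self-contained
sketch of that interaction; the inline stand-in classes and DemoAction are
hypothetical, defined here only so the sketch runs without Armada installed.

    import logging
    import sys

    logging.basicConfig(level=logging.INFO)


    class ArmadaBaseException(Exception):
        '''Stand-in for armada.exceptions.base_exception.ArmadaBaseException.'''


    class KubernetesErrorEventException(ArmadaBaseException):
        '''Stand-in for the new k8s_exceptions class of the same name.'''


    class CliAction(object):
        def __init__(self):
            self.logger = logging.getLogger(self.__class__.__name__)

        def safe_invoke(self):
            try:
                self.invoke()
            except ArmadaBaseException:
                # Known Armada failures: log the traceback, exit non-zero.
                self.logger.exception('Caught internal exception')
                sys.exit(1)
            except Exception:
                # Anything else is a bug; log it too instead of dumping a
                # raw stack trace on the user.
                self.logger.exception('Caught unexpected exception')
                sys.exit(1)

        def invoke(self):
            raise NotImplementedError()


    class DemoAction(CliAction):
        def invoke(self):
            # Simulates the k8s handler raising on an ERROR watch event.
            raise KubernetesErrorEventException('error event for pod: demo')


    if __name__ == '__main__':
        DemoAction().safe_invoke()  # logs the traceback, exits with status 1

Catching ArmadaBaseException first lets known failure modes exit cleanly with
a logged traceback, while the broad Exception handler guards against bugs.
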
diff --git a/armada/handlers/k8s.py b/armada/handlers/k8s.py
index f140c558..672de56e 100644
--- a/armada/handlers/k8s.py
+++ b/armada/handlers/k8s.py
@@ -23,12 +23,15 @@ from oslo_config import cfg
 from oslo_log import log as logging
 
 from armada.utils.release import label_selectors
+from armada.exceptions import k8s_exceptions as exceptions
 
 LOG = logging.getLogger(__name__)
 
 CONF = cfg.CONF
 
+READY_PHASES = {'Running', 'Succeeded'}
+
 
 class K8s(object):
     '''
@@ -207,77 +210,114 @@ class K8s(object):
                          namespace='default',
                          labels='',
                          timeout=300,
-                         sleep=15):
+                         sleep=15,
+                         required_successes=3,
+                         inter_success_wait=10):
         '''
         :param release - part of namespace
         :param timeout - time before disconnecting stream
         '''
-        LOG.debug("Wait on %s for %s sec", namespace, timeout)
-
         label_selector = ''
 
         if labels:
             label_selector = label_selectors(labels)
 
-        valid_state = ['Succeeded', 'Running']
+        LOG.debug("Wait on %s (%s) for %s sec", namespace, label_selector,
+                  timeout)
 
-        wait_timeout = time.time() + 60 * timeout
+        deadline = time.time() + timeout
 
-        while True:
+        # NOTE(mark-burnett): Attempt to wait multiple times without
+        # modification, in case new pods appear after our watch exits.
 
-            self.is_pods_ready(label_selector=label_selector, timeout=timeout)
+        successes = 0
+        while successes < required_successes:
+            deadline_remaining = int(deadline - time.time())
+            if deadline_remaining <= 0:
+                return False
+            timed_out, modified_pods, unready_pods = self.wait_one_time(
+                label_selector, timeout=deadline_remaining)
 
-            pod_ready = []
-            not_ready = []
-            for pod in self.client.list_pod_for_all_namespaces(
-                    label_selector=label_selector).items:
-                p_state = pod.status.phase
-                p_name = pod.metadata.name
-                if p_state in valid_state:
-                    pod_ready.append(True)
-                    continue
+            if timed_out:
+                LOG.info('Timed out waiting for pods: %s', unready_pods)
+                return False
 
-                pod_ready.append(False)
-                not_ready.append(p_name)
-
-                LOG.debug('%s', p_state)
-
-            if time.time() > wait_timeout or all(pod_ready):
-                LOG.debug("Pod States %s", pod_ready)
-                break
-            if time.time() > wait_timeout and not all(pod_ready):
-                LOG.exception(
-                    'Failed to bring up release %s: %s', release, not_ready)
-                break
+            if modified_pods:
+                successes = 0
+                LOG.debug('Continuing to wait, found modified pods: %s',
+                          modified_pods)
             else:
-                LOG.debug('time: %s pod %s', wait_timeout, pod_ready)
+                successes += 1
+                LOG.debug('Found no modified pods this attempt. successes=%d',
+                          successes)
 
-    def is_pods_ready(self, label_selector='', timeout=100):
-        '''
-        :params release_labels - list of labels to identify relevant pods
-        :params namespace - namespace in which to search for pods
+            time.sleep(inter_success_wait)
 
-        Returns after waiting for all pods to enter Ready state
-        '''
-        pods_found = []
-        valid_state = ['Succeeded', 'Running']
+        return True
 
+    def wait_one_time(self, label_selector='', timeout=100):
+        LOG.debug('Starting to wait: label_selector=%s, timeout=%s',
+                  label_selector, timeout)
+        ready_pods = {}
+        modified_pods = set()
         w = watch.Watch()
-        for pod in w.stream(self.client.list_pod_for_all_namespaces,
-                            label_selector=label_selector,
-                            timeout_seconds=timeout):
+        first_event = True
+        for event in w.stream(self.client.list_pod_for_all_namespaces,
+                              label_selector=label_selector,
+                              timeout_seconds=timeout):
+            if first_event:
+                pod_list = self.client.list_pod_for_all_namespaces(
+                    label_selector=label_selector,
+                    timeout_seconds=timeout)
+                for pod in pod_list.items:
+                    LOG.debug('Setting up to wait for pod %s',
+                              pod.metadata.name)
+                    ready_pods[pod.metadata.name] = False
+                first_event = False
 
-            pod_name = pod['object'].metadata.name
-            pod_state = pod['object'].status.phase
+            event_type = event['type'].upper()
+            pod_name = event['object'].metadata.name
 
-            if pod['type'] == 'ADDED' and pod_state not in valid_state:
-                LOG.debug("Pod %s in %s", pod_name, pod_state)
-                pods_found.append(pod_name)
-            elif pod_name in pods_found:
-                if pod_state in valid_state:
-                    pods_found.remove(pod_name)
-                    LOG.debug(pods_found)
+            if event_type in {'ADDED', 'MODIFIED'}:
+                status = event['object'].status
+                is_ready = status.phase in READY_PHASES
 
-            if not pods_found:
-                LOG.debug('Terminate wait')
-                w.stop()
+                if is_ready:
+                    LOG.debug('Pod %s (%s) is_ready=%s', pod_name, event_type,
+                              is_ready)
+                else:
+                    container_statuses = status.container_statuses
+                    conditions = status.conditions
+                    LOG.debug('Pod %s (%s) is_ready=%s container_statuses=%s '
+                              'conditions=%s', pod_name, event_type, is_ready,
+                              container_statuses, conditions)
+
+                ready_pods[pod_name] = is_ready
+
+                if event_type == 'MODIFIED':
+                    modified_pods.add(pod_name)
+
+            elif event_type == 'DELETED':
+                LOG.debug('Removing pod %s from tracking', pod_name)
+                ready_pods.pop(pod_name)
+
+            elif event_type == 'ERROR':
+                LOG.error('Got error event for pod: %s',
+                          event['object'].to_dict())
+                raise exceptions.KubernetesErrorEventException(
+                    'Got error event for pod: %s' % event['object'])
+
+            else:
+                LOG.error('Unrecognized event type (%s) for pod: %s',
+                          event_type, event['object'])
+                raise exceptions.KubernetesUnknownStreamingEventTypeException(
+                    'Got unknown event type (%s) for pod: %s'
+                    % (event_type, event['object']))
+
+            if all(ready_pods.values()):
+                return (False, modified_pods, [])
+
+        # NOTE(mark-burnett): This path is reachable if there are no pods
+        # (unlikely) or in the case of the watch timing out.
+        return (not all(ready_pods.values()), modified_pods,
+                [name for name, ready in ready_pods.items() if not ready])
diff --git a/docs/source/operations/exceptions/guide-exceptions.rst b/docs/source/operations/exceptions/guide-exceptions.rst
index f10407e4..5b58e631 100644
--- a/docs/source/operations/exceptions/guide-exceptions.rst
+++ b/docs/source/operations/exceptions/guide-exceptions.rst
@@ -21,6 +21,7 @@ Armada Exceptions
 .. include:: armada-exceptions.inc
 .. include:: base-exceptions.inc
 .. include:: chartbuilder-exceptions.inc
+.. include:: k8s-exceptions.inc
 .. include:: lint-exceptions.inc
 .. include:: manifest-exceptions.inc
 .. include:: override-exceptions.inc
diff --git a/docs/source/operations/exceptions/k8s-exceptions.rst b/docs/source/operations/exceptions/k8s-exceptions.rst
new file mode 100644
index 00000000..d5a4ef8e
--- /dev/null
+++ b/docs/source/operations/exceptions/k8s-exceptions.rst
@@ -0,0 +1,33 @@
+..
+      Copyright 2018 AT&T Intellectual Property.
+      All Rights Reserved.
+
+      Licensed under the Apache License, Version 2.0 (the "License"); you may
+      not use this file except in compliance with the License. You may obtain
+      a copy of the License at
+
+           http://www.apache.org/licenses/LICENSE-2.0
+
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+      WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+      License for the specific language governing permissions and limitations
+      under the License.
+
+
+.. list-table:: **Kubernetes Exceptions**
+   :widths: 5 50
+   :header-rows: 1
+
+   * - Exception Name
+     - Description
+   * - KubernetesErrorEventException
+     - .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesErrorEventException
+          :members:
+          :show-inheritance:
+          :undoc-members:
+   * - KubernetesUnknownStreamingEventTypeException
+     - .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesUnknownStreamingEventTypeException
+          :members:
+          :show-inheritance:
+          :undoc-members:
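
The core of the hang fix is the retry loop in the k8s.py hunk above: a single
watch can exit just before new pods appear, so success now requires several
consecutive watch passes that observe no pod modifications, all under one
overall deadline. Below is a standalone sketch of that loop; the hunk does not
show the outer method's name, so wait_until_quiet is a hypothetical name, and
wait_one_time here is a trivial stand-in for the watch-based version in the
patch.

    import time


    def wait_one_time(label_selector='', timeout=100):
        # Stand-in for K8s.wait_one_time: the real version streams pod
        # events from the Kubernetes watch API and returns the tuple
        # (timed_out, modified_pods, unready_pods).
        return (False, set(), [])


    def wait_until_quiet(label_selector='', timeout=300,
                         required_successes=3, inter_success_wait=10):
        deadline = time.time() + timeout
        successes = 0
        while successes < required_successes:
            deadline_remaining = int(deadline - time.time())
            if deadline_remaining <= 0:
                return False  # overall deadline exhausted
            timed_out, modified_pods, unready_pods = wait_one_time(
                label_selector, timeout=deadline_remaining)
            if timed_out:
                return False  # some pods never became ready
            if modified_pods:
                successes = 0  # saw churn, restart the quiet count
            else:
                successes += 1  # one more consecutive quiet pass
            time.sleep(inter_success_wait)
        return True


    print(wait_until_quiet(timeout=10, required_successes=2,
                           inter_success_wait=1))  # True after ~2 quiet seconds

With the defaults (required_successes=3, inter_success_wait=10), a release is
declared ready only after roughly thirty seconds of quiet, trading a little
latency for not returning while pods are still churning.
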