Fix: Issue where Armada hangs waiting for pods

This also hopefully provides better logging when waiting for pods.

Closes #194

Change-Id: I3704ff004c35c8ecf90555d16e42f15d24284492
This commit is contained in:
Mark Burnett 2018-02-15 11:54:03 -06:00
parent ff2c7d0951
commit e0b04e829b
11 changed files with 180 additions and 59 deletions

View File

@ -102,6 +102,7 @@ class Apply(api.BaseResource):
except exceptions.ManifestException as e: except exceptions.ManifestException as e:
self.return_error(resp, falcon.HTTP_400, message=str(e)) self.return_error(resp, falcon.HTTP_400, message=str(e))
except Exception as e: except Exception as e:
self.logger.exception('Caught unexpected exception')
err_message = 'Failed to apply manifest: {}'.format(e) err_message = 'Failed to apply manifest: {}'.format(e)
self.error(req.context, err_message) self.error(req.context, err_message)
self.return_error( self.return_error(

View File

@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
from oslo_config import cfg from oslo_config import cfg
from oslo_log import log as logging from oslo_log import log as logging
from armada import conf from armada import conf
from armada.exceptions.base_exception import ArmadaBaseException
conf.set_app_default_configs() conf.set_app_default_configs()
conf.set_default_for_default_log_levels() conf.set_default_for_default_log_levels()
@ -32,5 +35,15 @@ class CliAction(object):
logging.set_defaults(default_log_levels=CONF.default_log_levels) logging.set_defaults(default_log_levels=CONF.default_log_levels)
logging.setup(CONF, 'armada') logging.setup(CONF, 'armada')
def safe_invoke(self):
    '''Run ``invoke`` and convert any failure into a logged exit code 1.

    Armada's own exceptions and unexpected exceptions are logged (with
    traceback) under distinct messages so they can be told apart.

    :raises SystemExit: with code 1 when ``invoke`` raises an exception.
    '''
    try:
        self.invoke()
    except ArmadaBaseException:
        self.logger.exception('Caught internal exception')
        sys.exit(1)
    except Exception:
        # NOTE: ``Exception`` rather than a bare ``except`` so that
        # SystemExit and KeyboardInterrupt raised inside ``invoke``
        # propagate untouched instead of being logged as errors and
        # rewritten to exit status 1.
        self.logger.exception('Caught unexpected exception')
        sys.exit(1)
def invoke(self): def invoke(self):
raise Exception() raise NotImplementedError()

View File

@ -133,7 +133,7 @@ def apply_create(ctx, locations, api, disable_update_post, disable_update_pre,
ApplyManifest(ctx, locations, api, disable_update_post, disable_update_pre, ApplyManifest(ctx, locations, api, disable_update_post, disable_update_pre,
dry_run, enable_chart_cleanup, set, tiller_host, tiller_port, dry_run, enable_chart_cleanup, set, tiller_host, tiller_port,
tiller_namespace, timeout, values, wait, tiller_namespace, timeout, values, wait,
target_manifest).invoke() target_manifest).safe_invoke()
class ApplyManifest(CliAction): class ApplyManifest(CliAction):

View File

@ -73,8 +73,8 @@ SHORT_DESC = "Command deletes releases."
default=44134) default=44134)
@click.pass_context @click.pass_context
def delete_charts(ctx, manifest, releases, no_purge, tiller_host, tiller_port): def delete_charts(ctx, manifest, releases, no_purge, tiller_host, tiller_port):
DeleteChartManifest( DeleteChartManifest(ctx, manifest, releases, no_purge, tiller_host,
ctx, manifest, releases, no_purge, tiller_host, tiller_port).invoke() tiller_port).safe_invoke()
class DeleteChartManifest(CliAction): class DeleteChartManifest(CliAction):

View File

@ -82,7 +82,7 @@ def test_charts(ctx, file, release, tiller_host, tiller_port, tiller_namespace,
target_manifest): target_manifest):
TestChartManifest( TestChartManifest(
ctx, file, release, tiller_host, tiller_port, tiller_namespace, ctx, file, release, tiller_host, tiller_port, tiller_namespace,
target_manifest).invoke() target_manifest).safe_invoke()
class TestChartManifest(CliAction): class TestChartManifest(CliAction):

View File

@ -71,7 +71,7 @@ SHORT_DESC = "Command gets Tiller information."
def tiller_service(ctx, tiller_host, tiller_port, tiller_namespace, releases, def tiller_service(ctx, tiller_host, tiller_port, tiller_namespace, releases,
status): status):
TillerServices(ctx, tiller_host, tiller_port, tiller_namespace, releases, TillerServices(ctx, tiller_host, tiller_port, tiller_namespace, releases,
status).invoke() status).safe_invoke()
class TillerServices(CliAction): class TillerServices(CliAction):

View File

@ -48,7 +48,7 @@ SHORT_DESC = "Command validates Armada Manifest."
nargs=-1) nargs=-1)
@click.pass_context @click.pass_context
def validate_manifest(ctx, locations): def validate_manifest(ctx, locations):
ValidateManifest(ctx, locations).invoke() ValidateManifest(ctx, locations).safe_invoke()
class ValidateManifest(CliAction): class ValidateManifest(CliAction):

View File

@ -0,0 +1,33 @@
# Copyright 2018 The Armada Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from armada.exceptions.base_exception import ArmadaBaseException as ex
class KubernetesException(ex):
    '''Base class for Kubernetes exceptions and error handling.'''

    # Class-level message text; presumably picked up by the Armada base
    # exception when no explicit message is supplied -- confirm against
    # ArmadaBaseException.
    message = 'An unknown Kubernetes error occurred.'
class KubernetesUnknownStreamingEventTypeException(KubernetesException):
    '''Raised when the Kubernetes streaming API yields an event whose type
    is not one this code recognizes.
    '''

    message = 'An unknown event type was returned from the streaming API.'
class KubernetesErrorEventException(KubernetesException):
    '''Raised when the Kubernetes streaming API returns an error event.'''

    message = 'An error event was returned from the streaming API.'

View File

@ -23,12 +23,15 @@ from oslo_config import cfg
from oslo_log import log as logging from oslo_log import log as logging
from armada.utils.release import label_selectors from armada.utils.release import label_selectors
from armada.exceptions import k8s_exceptions as exceptions
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
CONF = cfg.CONF CONF = cfg.CONF
READY_PHASES = {'Running', 'Succeeded'}
class K8s(object): class K8s(object):
''' '''
@ -207,77 +210,114 @@ class K8s(object):
namespace='default', namespace='default',
labels='', labels='',
timeout=300, timeout=300,
sleep=15): sleep=15,
required_successes=3,
inter_success_wait=10):
''' '''
:param release - part of namespace :param release - part of namespace
:param timeout - time before disconnecting stream :param timeout - time before disconnecting stream
''' '''
LOG.debug("Wait on %s for %s sec", namespace, timeout)
label_selector = '' label_selector = ''
if labels: if labels:
label_selector = label_selectors(labels) label_selector = label_selectors(labels)
valid_state = ['Succeeded', 'Running'] LOG.debug("Wait on %s (%s) for %s sec", namespace, label_selector,
timeout)
wait_timeout = time.time() + 60 * timeout deadline = time.time() + timeout
while True: # NOTE(mark-burnett): Attempt to wait multiple times without
# modification, in case new pods appear after our watch exits.
self.is_pods_ready(label_selector=label_selector, timeout=timeout) successes = 0
while successes < required_successes:
deadline_remaining = int(deadline - time.time())
if deadline_remaining <= 0:
return False
timed_out, modified_pods, unready_pods = self.wait_one_time(
label_selector, timeout=deadline_remaining)
pod_ready = [] if timed_out:
not_ready = [] LOG.info('Timed out waiting for pods: %s', unready_pods)
for pod in self.client.list_pod_for_all_namespaces( return False
label_selector=label_selector).items:
p_state = pod.status.phase
p_name = pod.metadata.name
if p_state in valid_state:
pod_ready.append(True)
continue
pod_ready.append(False) if modified_pods:
not_ready.append(p_name) successes = 0
LOG.debug('Continuing to wait, found modified pods: %s',
LOG.debug('%s', p_state) modified_pods)
if time.time() > wait_timeout or all(pod_ready):
LOG.debug("Pod States %s", pod_ready)
break
if time.time() > wait_timeout and not all(pod_ready):
LOG.exception(
'Failed to bring up release %s: %s', release, not_ready)
break
else: else:
LOG.debug('time: %s pod %s', wait_timeout, pod_ready) successes += 1
LOG.debug('Found no modified pods this attempt. successes=%d',
successes)
def is_pods_ready(self, label_selector='', timeout=100): time.sleep(inter_success_wait)
'''
:params release_labels - list of labels to identify relevant pods
:params namespace - namespace in which to search for pods
Returns after waiting for all pods to enter Ready state return True
'''
pods_found = []
valid_state = ['Succeeded', 'Running']
def wait_one_time(self, label_selector='', timeout=100):
LOG.debug('Starting to wait: label_selector=%s, timeout=%s',
label_selector, timeout)
ready_pods = {}
modified_pods = set()
w = watch.Watch() w = watch.Watch()
for pod in w.stream(self.client.list_pod_for_all_namespaces, first_event = True
label_selector=label_selector, for event in w.stream(self.client.list_pod_for_all_namespaces,
timeout_seconds=timeout): label_selector=label_selector,
timeout_seconds=timeout):
if first_event:
pod_list = self.client.list_pod_for_all_namespaces(
label_selector=label_selector,
timeout_seconds=timeout)
for pod in pod_list.items:
LOG.debug('Setting up to wait for pod %s',
pod.metadata.name)
ready_pods[pod.metadata.name] = False
first_event = False
pod_name = pod['object'].metadata.name event_type = event['type'].upper()
pod_state = pod['object'].status.phase pod_name = event['object'].metadata.name
if pod['type'] == 'ADDED' and pod_state not in valid_state: if event_type in {'ADDED', 'MODIFIED'}:
LOG.debug("Pod %s in %s", pod_name, pod_state) status = event['object'].status
pods_found.append(pod_name) is_ready = status.phase in READY_PHASES
elif pod_name in pods_found:
if pod_state in valid_state:
pods_found.remove(pod_name)
LOG.debug(pods_found)
if not pods_found: if is_ready:
LOG.debug('Terminate wait') LOG.debug('Pod %s (%s) is_ready=%s', pod_name, event_type,
w.stop() is_ready)
else:
container_statuses = status.container_statuses
conditions = status.conditions
LOG.debug('Pod %s (%s) is_ready=%s container_statuses=%s '
'conditions=%s', pod_name, event_type, is_ready,
container_statuses, conditions)
ready_pods[pod_name] = is_ready
if event_type == 'MODIFIED':
modified_pods.add(pod_name)
elif event_type == 'DELETED':
LOG.debug('Removing pod %s from tracking', pod_name)
ready_pods.pop(pod_name)
elif event_type == 'ERROR':
LOG.error('Got error event for pod: %s',
event['object'].to_dict())
raise exceptions.KubernetesErrorEventException(
'Got error event for pod: %s' % event['object'])
else:
LOG.error('Unrecognized event type (%s) for pod: %s',
event_type, event['object'])
raise exceptions.KubernetesUnknownStreamingEventTypeException(
'Got unknown event type (%s) for pod: %s'
% (event_type, event['object']))
if all(ready_pods.values()):
return (False, modified_pods, [])
# NOTE(mark-burnett): This path is reachable if there are no pods
# (unlikely) or in the case of the watch timing out.
return (not all(ready_pods.values()), modified_pods,
[name for name, ready in ready_pods.items() if not ready])

View File

@ -21,6 +21,7 @@ Armada Exceptions
.. include:: armada-exceptions.inc .. include:: armada-exceptions.inc
.. include:: base-exceptions.inc .. include:: base-exceptions.inc
.. include:: chartbuilder-exceptions.inc .. include:: chartbuilder-exceptions.inc
.. include:: k8s-exceptions.inc
.. include:: lint-exceptions.inc .. include:: lint-exceptions.inc
.. include:: manifest-exceptions.inc .. include:: manifest-exceptions.inc
.. include:: override-exceptions.inc .. include:: override-exceptions.inc

View File

@ -0,0 +1,33 @@
..
Copyright 2018 AT&T Intellectual Property.
All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this file except in compliance with the License. You may obtain
a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations
under the License.
.. list-table:: **Kubernetes Exceptions**
:widths: 5 50
:header-rows: 1
* - Exception Name
- Description
* - KubernetesErrorEventException
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesErrorEventException
:members:
:show-inheritance:
:undoc-members:
* - KubernetesUnknownStreamingEventTypeException
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesUnknownStreamingEventTypeException
:members:
:show-inheritance:
:undoc-members: