Fix: Issue where Armada hangs waiting for pods

This also hopefully provides better logging when waiting for pods.

Closes #194

Change-Id: I3704ff004c35c8ecf90555d16e42f15d24284492
This commit is contained in:
Mark Burnett 2018-02-15 11:54:03 -06:00
parent ff2c7d0951
commit e0b04e829b
11 changed files with 180 additions and 59 deletions

View File

@ -102,6 +102,7 @@ class Apply(api.BaseResource):
except exceptions.ManifestException as e:
self.return_error(resp, falcon.HTTP_400, message=str(e))
except Exception as e:
self.logger.exception('Caught unexpected exception')
err_message = 'Failed to apply manifest: {}'.format(e)
self.error(req.context, err_message)
self.return_error(

View File

@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from oslo_config import cfg
from oslo_log import log as logging
from armada import conf
from armada.exceptions.base_exception import ArmadaBaseException
conf.set_app_default_configs()
conf.set_default_for_default_log_levels()
@ -32,5 +35,15 @@ class CliAction(object):
logging.set_defaults(default_log_levels=CONF.default_log_levels)
logging.setup(CONF, 'armada')
def safe_invoke(self):
try:
self.invoke()
except ArmadaBaseException:
self.logger.exception('Caught internal exception')
sys.exit(1)
except:
self.logger.exception('Caught unexpected exception')
sys.exit(1)
def invoke(self):
raise Exception()
raise NotImplementedError()

View File

@ -133,7 +133,7 @@ def apply_create(ctx, locations, api, disable_update_post, disable_update_pre,
ApplyManifest(ctx, locations, api, disable_update_post, disable_update_pre,
dry_run, enable_chart_cleanup, set, tiller_host, tiller_port,
tiller_namespace, timeout, values, wait,
target_manifest).invoke()
target_manifest).safe_invoke()
class ApplyManifest(CliAction):

View File

@ -73,8 +73,8 @@ SHORT_DESC = "Command deletes releases."
default=44134)
@click.pass_context
def delete_charts(ctx, manifest, releases, no_purge, tiller_host, tiller_port):
DeleteChartManifest(
ctx, manifest, releases, no_purge, tiller_host, tiller_port).invoke()
DeleteChartManifest(ctx, manifest, releases, no_purge, tiller_host,
tiller_port).safe_invoke()
class DeleteChartManifest(CliAction):

View File

@ -82,7 +82,7 @@ def test_charts(ctx, file, release, tiller_host, tiller_port, tiller_namespace,
target_manifest):
TestChartManifest(
ctx, file, release, tiller_host, tiller_port, tiller_namespace,
target_manifest).invoke()
target_manifest).safe_invoke()
class TestChartManifest(CliAction):

View File

@ -71,7 +71,7 @@ SHORT_DESC = "Command gets Tiller information."
def tiller_service(ctx, tiller_host, tiller_port, tiller_namespace, releases,
status):
TillerServices(ctx, tiller_host, tiller_port, tiller_namespace, releases,
status).invoke()
status).safe_invoke()
class TillerServices(CliAction):

View File

@ -48,7 +48,7 @@ SHORT_DESC = "Command validates Armada Manifest."
nargs=-1)
@click.pass_context
def validate_manifest(ctx, locations):
ValidateManifest(ctx, locations).invoke()
ValidateManifest(ctx, locations).safe_invoke()
class ValidateManifest(CliAction):

View File

@ -0,0 +1,33 @@
# Copyright 2018 The Armada Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from armada.exceptions.base_exception import ArmadaBaseException as ex
class KubernetesException(ex):
'''Base class for Kubernetes exceptions and error handling.'''
message = 'An unknown Kubernetes error occured.'
class KubernetesUnknownStreamingEventTypeException(KubernetesException):
'''Exception for getting an unknown event type from the Kubernetes API'''
message = 'An unknown event type was returned from the streaming API.'
class KubernetesErrorEventException(KubernetesException):
'''Exception for getting an error from the Kubernetes API'''
message = 'An error event was returned from the streaming API.'

View File

@ -23,12 +23,15 @@ from oslo_config import cfg
from oslo_log import log as logging
from armada.utils.release import label_selectors
from armada.exceptions import k8s_exceptions as exceptions
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
READY_PHASES = {'Running', 'Succeeded'}
class K8s(object):
'''
@ -207,77 +210,114 @@ class K8s(object):
namespace='default',
labels='',
timeout=300,
sleep=15):
sleep=15,
required_successes=3,
inter_success_wait=10):
'''
:param release - part of namespace
:param timeout - time before disconnecting stream
'''
LOG.debug("Wait on %s for %s sec", namespace, timeout)
label_selector = ''
if labels:
label_selector = label_selectors(labels)
valid_state = ['Succeeded', 'Running']
LOG.debug("Wait on %s (%s) for %s sec", namespace, label_selector,
timeout)
wait_timeout = time.time() + 60 * timeout
deadline = time.time() + timeout
while True:
# NOTE(mark-burnett): Attempt to wait multiple times without
# modification, in case new pods appear after our watch exits.
self.is_pods_ready(label_selector=label_selector, timeout=timeout)
successes = 0
while successes < required_successes:
deadline_remaining = int(deadline - time.time())
if deadline_remaining <= 0:
return False
timed_out, modified_pods, unready_pods = self.wait_one_time(
label_selector, timeout=deadline_remaining)
pod_ready = []
not_ready = []
for pod in self.client.list_pod_for_all_namespaces(
label_selector=label_selector).items:
p_state = pod.status.phase
p_name = pod.metadata.name
if p_state in valid_state:
pod_ready.append(True)
continue
if timed_out:
LOG.info('Timed out waiting for pods: %s', unready_pods)
return False
pod_ready.append(False)
not_ready.append(p_name)
LOG.debug('%s', p_state)
if time.time() > wait_timeout or all(pod_ready):
LOG.debug("Pod States %s", pod_ready)
break
if time.time() > wait_timeout and not all(pod_ready):
LOG.exception(
'Failed to bring up release %s: %s', release, not_ready)
break
if modified_pods:
successes = 0
LOG.debug('Continuing to wait, found modified pods: %s',
modified_pods)
else:
LOG.debug('time: %s pod %s', wait_timeout, pod_ready)
successes += 1
LOG.debug('Found no modified pods this attempt. successes=%d',
successes)
def is_pods_ready(self, label_selector='', timeout=100):
'''
:params release_labels - list of labels to identify relevant pods
:params namespace - namespace in which to search for pods
time.sleep(inter_success_wait)
Returns after waiting for all pods to enter Ready state
'''
pods_found = []
valid_state = ['Succeeded', 'Running']
return True
def wait_one_time(self, label_selector='', timeout=100):
LOG.debug('Starting to wait: label_selector=%s, timeout=%s',
label_selector, timeout)
ready_pods = {}
modified_pods = set()
w = watch.Watch()
for pod in w.stream(self.client.list_pod_for_all_namespaces,
label_selector=label_selector,
timeout_seconds=timeout):
first_event = True
for event in w.stream(self.client.list_pod_for_all_namespaces,
label_selector=label_selector,
timeout_seconds=timeout):
if first_event:
pod_list = self.client.list_pod_for_all_namespaces(
label_selector=label_selector,
timeout_seconds=timeout)
for pod in pod_list.items:
LOG.debug('Setting up to wait for pod %s',
pod.metadata.name)
ready_pods[pod.metadata.name] = False
first_event = False
pod_name = pod['object'].metadata.name
pod_state = pod['object'].status.phase
event_type = event['type'].upper()
pod_name = event['object'].metadata.name
if pod['type'] == 'ADDED' and pod_state not in valid_state:
LOG.debug("Pod %s in %s", pod_name, pod_state)
pods_found.append(pod_name)
elif pod_name in pods_found:
if pod_state in valid_state:
pods_found.remove(pod_name)
LOG.debug(pods_found)
if event_type in {'ADDED', 'MODIFIED'}:
status = event['object'].status
is_ready = status.phase in READY_PHASES
if not pods_found:
LOG.debug('Terminate wait')
w.stop()
if is_ready:
LOG.debug('Pod %s (%s) is_ready=%s', pod_name, event_type,
is_ready)
else:
container_statuses = status.container_statuses
conditions = status.conditions
LOG.debug('Pod %s (%s) is_ready=%s container_statuses=%s '
'conditions=%s', pod_name, event_type, is_ready,
container_statuses, conditions)
ready_pods[pod_name] = is_ready
if event_type == 'MODIFIED':
modified_pods.add(pod_name)
elif event_type == 'DELETED':
LOG.debug('Removing pod %s from tracking', pod_name)
ready_pods.pop(pod_name)
elif event_type == 'ERROR':
LOG.error('Got error event for pod: %s',
event['object'].to_dict())
raise exceptions.KubernetesErrorEventException(
'Got error event for pod: %s' % event['object'])
else:
LOG.error('Unrecognized event type (%s) for pod: %s',
event_type, event['object'])
raise exceptions.KubernetesUnknownStreamingEventTypeException(
'Got unknown event type (%s) for pod: %s'
% (event_type, event['object']))
if all(ready_pods.values()):
return (False, modified_pods, [])
# NOTE(mark-burnett): This path is reachable if there are no pods
# (unlikely) or in the case of the watch timing out.
return (not all(ready_pods.values()), modified_pods,
[name for name, ready in ready_pods.items() if not ready])

View File

@ -21,6 +21,7 @@ Armada Exceptions
.. include:: armada-exceptions.inc
.. include:: base-exceptions.inc
.. include:: chartbuilder-exceptions.inc
.. include:: k8s-exceptions.inc
.. include:: lint-exceptions.inc
.. include:: manifest-exceptions.inc
.. include:: override-exceptions.inc

View File

@ -0,0 +1,33 @@
..
Copyright 2018 AT&T Intellectual Property.
All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this file except in compliance with the License. You may obtain
a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations
under the License.
.. list-table:: **Kubernetes Exceptions**
:widths: 5 50
:header-rows: 1
* - Exception Name
- Description
* - KubernetesErrorEventException
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesErrorEventException
:members:
:show-inheritance:
:undoc-members:
* - KubernetesUnknownStreamngEventTypeException
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesUnknownStreamngEventTypeException
:members:
:show-inheritance:
:undoc-members: