Fix: Issue where Armada hangs waiting for pods
This also hopefully provides better logging when waiting for pods. Closes #194 Change-Id: I3704ff004c35c8ecf90555d16e42f15d24284492
This commit is contained in:
parent
ff2c7d0951
commit
e0b04e829b
|
@ -102,6 +102,7 @@ class Apply(api.BaseResource):
|
||||||
except exceptions.ManifestException as e:
|
except exceptions.ManifestException as e:
|
||||||
self.return_error(resp, falcon.HTTP_400, message=str(e))
|
self.return_error(resp, falcon.HTTP_400, message=str(e))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
self.logger.exception('Caught unexpected exception')
|
||||||
err_message = 'Failed to apply manifest: {}'.format(e)
|
err_message = 'Failed to apply manifest: {}'.format(e)
|
||||||
self.error(req.context, err_message)
|
self.error(req.context, err_message)
|
||||||
self.return_error(
|
self.return_error(
|
||||||
|
|
|
@ -12,10 +12,13 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
from oslo_log import log as logging
|
from oslo_log import log as logging
|
||||||
|
|
||||||
from armada import conf
|
from armada import conf
|
||||||
|
from armada.exceptions.base_exception import ArmadaBaseException
|
||||||
|
|
||||||
conf.set_app_default_configs()
|
conf.set_app_default_configs()
|
||||||
conf.set_default_for_default_log_levels()
|
conf.set_default_for_default_log_levels()
|
||||||
|
@ -32,5 +35,15 @@ class CliAction(object):
|
||||||
logging.set_defaults(default_log_levels=CONF.default_log_levels)
|
logging.set_defaults(default_log_levels=CONF.default_log_levels)
|
||||||
logging.setup(CONF, 'armada')
|
logging.setup(CONF, 'armada')
|
||||||
|
|
||||||
|
def safe_invoke(self):
|
||||||
|
try:
|
||||||
|
self.invoke()
|
||||||
|
except ArmadaBaseException:
|
||||||
|
self.logger.exception('Caught internal exception')
|
||||||
|
sys.exit(1)
|
||||||
|
except:
|
||||||
|
self.logger.exception('Caught unexpected exception')
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
def invoke(self):
|
def invoke(self):
|
||||||
raise Exception()
|
raise NotImplementedError()
|
||||||
|
|
|
@ -133,7 +133,7 @@ def apply_create(ctx, locations, api, disable_update_post, disable_update_pre,
|
||||||
ApplyManifest(ctx, locations, api, disable_update_post, disable_update_pre,
|
ApplyManifest(ctx, locations, api, disable_update_post, disable_update_pre,
|
||||||
dry_run, enable_chart_cleanup, set, tiller_host, tiller_port,
|
dry_run, enable_chart_cleanup, set, tiller_host, tiller_port,
|
||||||
tiller_namespace, timeout, values, wait,
|
tiller_namespace, timeout, values, wait,
|
||||||
target_manifest).invoke()
|
target_manifest).safe_invoke()
|
||||||
|
|
||||||
|
|
||||||
class ApplyManifest(CliAction):
|
class ApplyManifest(CliAction):
|
||||||
|
|
|
@ -73,8 +73,8 @@ SHORT_DESC = "Command deletes releases."
|
||||||
default=44134)
|
default=44134)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def delete_charts(ctx, manifest, releases, no_purge, tiller_host, tiller_port):
|
def delete_charts(ctx, manifest, releases, no_purge, tiller_host, tiller_port):
|
||||||
DeleteChartManifest(
|
DeleteChartManifest(ctx, manifest, releases, no_purge, tiller_host,
|
||||||
ctx, manifest, releases, no_purge, tiller_host, tiller_port).invoke()
|
tiller_port).safe_invoke()
|
||||||
|
|
||||||
|
|
||||||
class DeleteChartManifest(CliAction):
|
class DeleteChartManifest(CliAction):
|
||||||
|
|
|
@ -82,7 +82,7 @@ def test_charts(ctx, file, release, tiller_host, tiller_port, tiller_namespace,
|
||||||
target_manifest):
|
target_manifest):
|
||||||
TestChartManifest(
|
TestChartManifest(
|
||||||
ctx, file, release, tiller_host, tiller_port, tiller_namespace,
|
ctx, file, release, tiller_host, tiller_port, tiller_namespace,
|
||||||
target_manifest).invoke()
|
target_manifest).safe_invoke()
|
||||||
|
|
||||||
|
|
||||||
class TestChartManifest(CliAction):
|
class TestChartManifest(CliAction):
|
||||||
|
|
|
@ -71,7 +71,7 @@ SHORT_DESC = "Command gets Tiller information."
|
||||||
def tiller_service(ctx, tiller_host, tiller_port, tiller_namespace, releases,
|
def tiller_service(ctx, tiller_host, tiller_port, tiller_namespace, releases,
|
||||||
status):
|
status):
|
||||||
TillerServices(ctx, tiller_host, tiller_port, tiller_namespace, releases,
|
TillerServices(ctx, tiller_host, tiller_port, tiller_namespace, releases,
|
||||||
status).invoke()
|
status).safe_invoke()
|
||||||
|
|
||||||
|
|
||||||
class TillerServices(CliAction):
|
class TillerServices(CliAction):
|
||||||
|
|
|
@ -48,7 +48,7 @@ SHORT_DESC = "Command validates Armada Manifest."
|
||||||
nargs=-1)
|
nargs=-1)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def validate_manifest(ctx, locations):
|
def validate_manifest(ctx, locations):
|
||||||
ValidateManifest(ctx, locations).invoke()
|
ValidateManifest(ctx, locations).safe_invoke()
|
||||||
|
|
||||||
|
|
||||||
class ValidateManifest(CliAction):
|
class ValidateManifest(CliAction):
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
# Copyright 2018 The Armada Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from armada.exceptions.base_exception import ArmadaBaseException as ex
|
||||||
|
|
||||||
|
|
||||||
|
class KubernetesException(ex):
|
||||||
|
'''Base class for Kubernetes exceptions and error handling.'''
|
||||||
|
|
||||||
|
message = 'An unknown Kubernetes error occured.'
|
||||||
|
|
||||||
|
|
||||||
|
class KubernetesUnknownStreamingEventTypeException(KubernetesException):
|
||||||
|
'''Exception for getting an unknown event type from the Kubernetes API'''
|
||||||
|
|
||||||
|
message = 'An unknown event type was returned from the streaming API.'
|
||||||
|
|
||||||
|
|
||||||
|
class KubernetesErrorEventException(KubernetesException):
|
||||||
|
'''Exception for getting an error from the Kubernetes API'''
|
||||||
|
|
||||||
|
message = 'An error event was returned from the streaming API.'
|
|
@ -23,12 +23,15 @@ from oslo_config import cfg
|
||||||
from oslo_log import log as logging
|
from oslo_log import log as logging
|
||||||
|
|
||||||
from armada.utils.release import label_selectors
|
from armada.utils.release import label_selectors
|
||||||
|
from armada.exceptions import k8s_exceptions as exceptions
|
||||||
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
CONF = cfg.CONF
|
CONF = cfg.CONF
|
||||||
|
|
||||||
|
READY_PHASES = {'Running', 'Succeeded'}
|
||||||
|
|
||||||
|
|
||||||
class K8s(object):
|
class K8s(object):
|
||||||
'''
|
'''
|
||||||
|
@ -207,77 +210,114 @@ class K8s(object):
|
||||||
namespace='default',
|
namespace='default',
|
||||||
labels='',
|
labels='',
|
||||||
timeout=300,
|
timeout=300,
|
||||||
sleep=15):
|
sleep=15,
|
||||||
|
required_successes=3,
|
||||||
|
inter_success_wait=10):
|
||||||
'''
|
'''
|
||||||
:param release - part of namespace
|
:param release - part of namespace
|
||||||
:param timeout - time before disconnecting stream
|
:param timeout - time before disconnecting stream
|
||||||
'''
|
'''
|
||||||
LOG.debug("Wait on %s for %s sec", namespace, timeout)
|
|
||||||
|
|
||||||
label_selector = ''
|
label_selector = ''
|
||||||
|
|
||||||
if labels:
|
if labels:
|
||||||
label_selector = label_selectors(labels)
|
label_selector = label_selectors(labels)
|
||||||
|
|
||||||
valid_state = ['Succeeded', 'Running']
|
LOG.debug("Wait on %s (%s) for %s sec", namespace, label_selector,
|
||||||
|
timeout)
|
||||||
|
|
||||||
wait_timeout = time.time() + 60 * timeout
|
deadline = time.time() + timeout
|
||||||
|
|
||||||
while True:
|
# NOTE(mark-burnett): Attempt to wait multiple times without
|
||||||
|
# modification, in case new pods appear after our watch exits.
|
||||||
|
|
||||||
self.is_pods_ready(label_selector=label_selector, timeout=timeout)
|
successes = 0
|
||||||
|
while successes < required_successes:
|
||||||
|
deadline_remaining = int(deadline - time.time())
|
||||||
|
if deadline_remaining <= 0:
|
||||||
|
return False
|
||||||
|
timed_out, modified_pods, unready_pods = self.wait_one_time(
|
||||||
|
label_selector, timeout=deadline_remaining)
|
||||||
|
|
||||||
pod_ready = []
|
if timed_out:
|
||||||
not_ready = []
|
LOG.info('Timed out waiting for pods: %s', unready_pods)
|
||||||
for pod in self.client.list_pod_for_all_namespaces(
|
return False
|
||||||
label_selector=label_selector).items:
|
|
||||||
p_state = pod.status.phase
|
|
||||||
p_name = pod.metadata.name
|
|
||||||
if p_state in valid_state:
|
|
||||||
pod_ready.append(True)
|
|
||||||
continue
|
|
||||||
|
|
||||||
pod_ready.append(False)
|
if modified_pods:
|
||||||
not_ready.append(p_name)
|
successes = 0
|
||||||
|
LOG.debug('Continuing to wait, found modified pods: %s',
|
||||||
LOG.debug('%s', p_state)
|
modified_pods)
|
||||||
|
|
||||||
if time.time() > wait_timeout or all(pod_ready):
|
|
||||||
LOG.debug("Pod States %s", pod_ready)
|
|
||||||
break
|
|
||||||
if time.time() > wait_timeout and not all(pod_ready):
|
|
||||||
LOG.exception(
|
|
||||||
'Failed to bring up release %s: %s', release, not_ready)
|
|
||||||
break
|
|
||||||
else:
|
else:
|
||||||
LOG.debug('time: %s pod %s', wait_timeout, pod_ready)
|
successes += 1
|
||||||
|
LOG.debug('Found no modified pods this attempt. successes=%d',
|
||||||
|
successes)
|
||||||
|
|
||||||
def is_pods_ready(self, label_selector='', timeout=100):
|
time.sleep(inter_success_wait)
|
||||||
'''
|
|
||||||
:params release_labels - list of labels to identify relevant pods
|
|
||||||
:params namespace - namespace in which to search for pods
|
|
||||||
|
|
||||||
Returns after waiting for all pods to enter Ready state
|
return True
|
||||||
'''
|
|
||||||
pods_found = []
|
|
||||||
valid_state = ['Succeeded', 'Running']
|
|
||||||
|
|
||||||
|
def wait_one_time(self, label_selector='', timeout=100):
|
||||||
|
LOG.debug('Starting to wait: label_selector=%s, timeout=%s',
|
||||||
|
label_selector, timeout)
|
||||||
|
ready_pods = {}
|
||||||
|
modified_pods = set()
|
||||||
w = watch.Watch()
|
w = watch.Watch()
|
||||||
for pod in w.stream(self.client.list_pod_for_all_namespaces,
|
first_event = True
|
||||||
label_selector=label_selector,
|
for event in w.stream(self.client.list_pod_for_all_namespaces,
|
||||||
timeout_seconds=timeout):
|
label_selector=label_selector,
|
||||||
|
timeout_seconds=timeout):
|
||||||
|
if first_event:
|
||||||
|
pod_list = self.client.list_pod_for_all_namespaces(
|
||||||
|
label_selector=label_selector,
|
||||||
|
timeout_seconds=timeout)
|
||||||
|
for pod in pod_list.items:
|
||||||
|
LOG.debug('Setting up to wait for pod %s',
|
||||||
|
pod.metadata.name)
|
||||||
|
ready_pods[pod.metadata.name] = False
|
||||||
|
first_event = False
|
||||||
|
|
||||||
pod_name = pod['object'].metadata.name
|
event_type = event['type'].upper()
|
||||||
pod_state = pod['object'].status.phase
|
pod_name = event['object'].metadata.name
|
||||||
|
|
||||||
if pod['type'] == 'ADDED' and pod_state not in valid_state:
|
if event_type in {'ADDED', 'MODIFIED'}:
|
||||||
LOG.debug("Pod %s in %s", pod_name, pod_state)
|
status = event['object'].status
|
||||||
pods_found.append(pod_name)
|
is_ready = status.phase in READY_PHASES
|
||||||
elif pod_name in pods_found:
|
|
||||||
if pod_state in valid_state:
|
|
||||||
pods_found.remove(pod_name)
|
|
||||||
LOG.debug(pods_found)
|
|
||||||
|
|
||||||
if not pods_found:
|
if is_ready:
|
||||||
LOG.debug('Terminate wait')
|
LOG.debug('Pod %s (%s) is_ready=%s', pod_name, event_type,
|
||||||
w.stop()
|
is_ready)
|
||||||
|
else:
|
||||||
|
container_statuses = status.container_statuses
|
||||||
|
conditions = status.conditions
|
||||||
|
LOG.debug('Pod %s (%s) is_ready=%s container_statuses=%s '
|
||||||
|
'conditions=%s', pod_name, event_type, is_ready,
|
||||||
|
container_statuses, conditions)
|
||||||
|
|
||||||
|
ready_pods[pod_name] = is_ready
|
||||||
|
|
||||||
|
if event_type == 'MODIFIED':
|
||||||
|
modified_pods.add(pod_name)
|
||||||
|
|
||||||
|
elif event_type == 'DELETED':
|
||||||
|
LOG.debug('Removing pod %s from tracking', pod_name)
|
||||||
|
ready_pods.pop(pod_name)
|
||||||
|
|
||||||
|
elif event_type == 'ERROR':
|
||||||
|
LOG.error('Got error event for pod: %s',
|
||||||
|
event['object'].to_dict())
|
||||||
|
raise exceptions.KubernetesErrorEventException(
|
||||||
|
'Got error event for pod: %s' % event['object'])
|
||||||
|
|
||||||
|
else:
|
||||||
|
LOG.error('Unrecognized event type (%s) for pod: %s',
|
||||||
|
event_type, event['object'])
|
||||||
|
raise exceptions.KubernetesUnknownStreamingEventTypeException(
|
||||||
|
'Got unknown event type (%s) for pod: %s'
|
||||||
|
% (event_type, event['object']))
|
||||||
|
|
||||||
|
if all(ready_pods.values()):
|
||||||
|
return (False, modified_pods, [])
|
||||||
|
|
||||||
|
# NOTE(mark-burnett): This path is reachable if there are no pods
|
||||||
|
# (unlikely) or in the case of the watch timing out.
|
||||||
|
return (not all(ready_pods.values()), modified_pods,
|
||||||
|
[name for name, ready in ready_pods.items() if not ready])
|
||||||
|
|
|
@ -21,6 +21,7 @@ Armada Exceptions
|
||||||
.. include:: armada-exceptions.inc
|
.. include:: armada-exceptions.inc
|
||||||
.. include:: base-exceptions.inc
|
.. include:: base-exceptions.inc
|
||||||
.. include:: chartbuilder-exceptions.inc
|
.. include:: chartbuilder-exceptions.inc
|
||||||
|
.. include:: k8s-exceptions.inc
|
||||||
.. include:: lint-exceptions.inc
|
.. include:: lint-exceptions.inc
|
||||||
.. include:: manifest-exceptions.inc
|
.. include:: manifest-exceptions.inc
|
||||||
.. include:: override-exceptions.inc
|
.. include:: override-exceptions.inc
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
..
|
||||||
|
Copyright 2018 AT&T Intellectual Property.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
not use this file except in compliance with the License. You may obtain
|
||||||
|
a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific language governing permissions and limitations
|
||||||
|
under the License.
|
||||||
|
|
||||||
|
|
||||||
|
.. list-table:: **Kubernetes Exceptions**
|
||||||
|
:widths: 5 50
|
||||||
|
:header-rows: 1
|
||||||
|
|
||||||
|
* - Exception Name
|
||||||
|
- Description
|
||||||
|
* - KubernetesErrorEventException
|
||||||
|
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesErrorEventException
|
||||||
|
:members:
|
||||||
|
:show-inheritance:
|
||||||
|
:undoc-members:
|
||||||
|
  * - KubernetesUnknownStreamingEventTypeException
|
||||||
|
    - .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesUnknownStreamingEventTypeException
|
||||||
|
:members:
|
||||||
|
:show-inheritance:
|
||||||
|
:undoc-members:
|
Loading…
Reference in New Issue