Fix: Issue where Armada hangs waiting for pods
This also hopefully provides better logging when waiting for pods. Closes #194 Change-Id: I3704ff004c35c8ecf90555d16e42f15d24284492
This commit is contained in:
parent
ff2c7d0951
commit
e0b04e829b
|
@ -102,6 +102,7 @@ class Apply(api.BaseResource):
|
|||
except exceptions.ManifestException as e:
|
||||
self.return_error(resp, falcon.HTTP_400, message=str(e))
|
||||
except Exception as e:
|
||||
self.logger.exception('Caught unexpected exception')
|
||||
err_message = 'Failed to apply manifest: {}'.format(e)
|
||||
self.error(req.context, err_message)
|
||||
self.return_error(
|
||||
|
|
|
@ -12,10 +12,13 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log as logging
|
||||
|
||||
from armada import conf
|
||||
from armada.exceptions.base_exception import ArmadaBaseException
|
||||
|
||||
conf.set_app_default_configs()
|
||||
conf.set_default_for_default_log_levels()
|
||||
|
@ -32,5 +35,15 @@ class CliAction(object):
|
|||
logging.set_defaults(default_log_levels=CONF.default_log_levels)
|
||||
logging.setup(CONF, 'armada')
|
||||
|
||||
def safe_invoke(self):
|
||||
try:
|
||||
self.invoke()
|
||||
except ArmadaBaseException:
|
||||
self.logger.exception('Caught internal exception')
|
||||
sys.exit(1)
|
||||
except:
|
||||
self.logger.exception('Caught unexpected exception')
|
||||
sys.exit(1)
|
||||
|
||||
def invoke(self):
|
||||
raise Exception()
|
||||
raise NotImplementedError()
|
||||
|
|
|
@ -133,7 +133,7 @@ def apply_create(ctx, locations, api, disable_update_post, disable_update_pre,
|
|||
ApplyManifest(ctx, locations, api, disable_update_post, disable_update_pre,
|
||||
dry_run, enable_chart_cleanup, set, tiller_host, tiller_port,
|
||||
tiller_namespace, timeout, values, wait,
|
||||
target_manifest).invoke()
|
||||
target_manifest).safe_invoke()
|
||||
|
||||
|
||||
class ApplyManifest(CliAction):
|
||||
|
|
|
@ -73,8 +73,8 @@ SHORT_DESC = "Command deletes releases."
|
|||
default=44134)
|
||||
@click.pass_context
|
||||
def delete_charts(ctx, manifest, releases, no_purge, tiller_host, tiller_port):
|
||||
DeleteChartManifest(
|
||||
ctx, manifest, releases, no_purge, tiller_host, tiller_port).invoke()
|
||||
DeleteChartManifest(ctx, manifest, releases, no_purge, tiller_host,
|
||||
tiller_port).safe_invoke()
|
||||
|
||||
|
||||
class DeleteChartManifest(CliAction):
|
||||
|
|
|
@ -82,7 +82,7 @@ def test_charts(ctx, file, release, tiller_host, tiller_port, tiller_namespace,
|
|||
target_manifest):
|
||||
TestChartManifest(
|
||||
ctx, file, release, tiller_host, tiller_port, tiller_namespace,
|
||||
target_manifest).invoke()
|
||||
target_manifest).safe_invoke()
|
||||
|
||||
|
||||
class TestChartManifest(CliAction):
|
||||
|
|
|
@ -71,7 +71,7 @@ SHORT_DESC = "Command gets Tiller information."
|
|||
def tiller_service(ctx, tiller_host, tiller_port, tiller_namespace, releases,
|
||||
status):
|
||||
TillerServices(ctx, tiller_host, tiller_port, tiller_namespace, releases,
|
||||
status).invoke()
|
||||
status).safe_invoke()
|
||||
|
||||
|
||||
class TillerServices(CliAction):
|
||||
|
|
|
@ -48,7 +48,7 @@ SHORT_DESC = "Command validates Armada Manifest."
|
|||
nargs=-1)
|
||||
@click.pass_context
|
||||
def validate_manifest(ctx, locations):
|
||||
ValidateManifest(ctx, locations).invoke()
|
||||
ValidateManifest(ctx, locations).safe_invoke()
|
||||
|
||||
|
||||
class ValidateManifest(CliAction):
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
# Copyright 2018 The Armada Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from armada.exceptions.base_exception import ArmadaBaseException as ex
|
||||
|
||||
|
||||
class KubernetesException(ex):
|
||||
'''Base class for Kubernetes exceptions and error handling.'''
|
||||
|
||||
message = 'An unknown Kubernetes error occured.'
|
||||
|
||||
|
||||
class KubernetesUnknownStreamingEventTypeException(KubernetesException):
|
||||
'''Exception for getting an unknown event type from the Kubernetes API'''
|
||||
|
||||
message = 'An unknown event type was returned from the streaming API.'
|
||||
|
||||
|
||||
class KubernetesErrorEventException(KubernetesException):
|
||||
'''Exception for getting an error from the Kubernetes API'''
|
||||
|
||||
message = 'An error event was returned from the streaming API.'
|
|
@ -23,12 +23,15 @@ from oslo_config import cfg
|
|||
from oslo_log import log as logging
|
||||
|
||||
from armada.utils.release import label_selectors
|
||||
from armada.exceptions import k8s_exceptions as exceptions
|
||||
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
CONF = cfg.CONF
|
||||
|
||||
READY_PHASES = {'Running', 'Succeeded'}
|
||||
|
||||
|
||||
class K8s(object):
|
||||
'''
|
||||
|
@ -207,77 +210,114 @@ class K8s(object):
|
|||
namespace='default',
|
||||
labels='',
|
||||
timeout=300,
|
||||
sleep=15):
|
||||
sleep=15,
|
||||
required_successes=3,
|
||||
inter_success_wait=10):
|
||||
'''
|
||||
:param release - part of namespace
|
||||
:param timeout - time before disconnecting stream
|
||||
'''
|
||||
LOG.debug("Wait on %s for %s sec", namespace, timeout)
|
||||
|
||||
label_selector = ''
|
||||
|
||||
if labels:
|
||||
label_selector = label_selectors(labels)
|
||||
|
||||
valid_state = ['Succeeded', 'Running']
|
||||
LOG.debug("Wait on %s (%s) for %s sec", namespace, label_selector,
|
||||
timeout)
|
||||
|
||||
wait_timeout = time.time() + 60 * timeout
|
||||
deadline = time.time() + timeout
|
||||
|
||||
while True:
|
||||
# NOTE(mark-burnett): Attempt to wait multiple times without
|
||||
# modification, in case new pods appear after our watch exits.
|
||||
|
||||
self.is_pods_ready(label_selector=label_selector, timeout=timeout)
|
||||
successes = 0
|
||||
while successes < required_successes:
|
||||
deadline_remaining = int(deadline - time.time())
|
||||
if deadline_remaining <= 0:
|
||||
return False
|
||||
timed_out, modified_pods, unready_pods = self.wait_one_time(
|
||||
label_selector, timeout=deadline_remaining)
|
||||
|
||||
pod_ready = []
|
||||
not_ready = []
|
||||
for pod in self.client.list_pod_for_all_namespaces(
|
||||
label_selector=label_selector).items:
|
||||
p_state = pod.status.phase
|
||||
p_name = pod.metadata.name
|
||||
if p_state in valid_state:
|
||||
pod_ready.append(True)
|
||||
continue
|
||||
if timed_out:
|
||||
LOG.info('Timed out waiting for pods: %s', unready_pods)
|
||||
return False
|
||||
|
||||
pod_ready.append(False)
|
||||
not_ready.append(p_name)
|
||||
|
||||
LOG.debug('%s', p_state)
|
||||
|
||||
if time.time() > wait_timeout or all(pod_ready):
|
||||
LOG.debug("Pod States %s", pod_ready)
|
||||
break
|
||||
if time.time() > wait_timeout and not all(pod_ready):
|
||||
LOG.exception(
|
||||
'Failed to bring up release %s: %s', release, not_ready)
|
||||
break
|
||||
if modified_pods:
|
||||
successes = 0
|
||||
LOG.debug('Continuing to wait, found modified pods: %s',
|
||||
modified_pods)
|
||||
else:
|
||||
LOG.debug('time: %s pod %s', wait_timeout, pod_ready)
|
||||
successes += 1
|
||||
LOG.debug('Found no modified pods this attempt. successes=%d',
|
||||
successes)
|
||||
|
||||
def is_pods_ready(self, label_selector='', timeout=100):
|
||||
'''
|
||||
:params release_labels - list of labels to identify relevant pods
|
||||
:params namespace - namespace in which to search for pods
|
||||
time.sleep(inter_success_wait)
|
||||
|
||||
Returns after waiting for all pods to enter Ready state
|
||||
'''
|
||||
pods_found = []
|
||||
valid_state = ['Succeeded', 'Running']
|
||||
return True
|
||||
|
||||
def wait_one_time(self, label_selector='', timeout=100):
|
||||
LOG.debug('Starting to wait: label_selector=%s, timeout=%s',
|
||||
label_selector, timeout)
|
||||
ready_pods = {}
|
||||
modified_pods = set()
|
||||
w = watch.Watch()
|
||||
for pod in w.stream(self.client.list_pod_for_all_namespaces,
|
||||
first_event = True
|
||||
for event in w.stream(self.client.list_pod_for_all_namespaces,
|
||||
label_selector=label_selector,
|
||||
timeout_seconds=timeout):
|
||||
if first_event:
|
||||
pod_list = self.client.list_pod_for_all_namespaces(
|
||||
label_selector=label_selector,
|
||||
timeout_seconds=timeout)
|
||||
for pod in pod_list.items:
|
||||
LOG.debug('Setting up to wait for pod %s',
|
||||
pod.metadata.name)
|
||||
ready_pods[pod.metadata.name] = False
|
||||
first_event = False
|
||||
|
||||
pod_name = pod['object'].metadata.name
|
||||
pod_state = pod['object'].status.phase
|
||||
event_type = event['type'].upper()
|
||||
pod_name = event['object'].metadata.name
|
||||
|
||||
if pod['type'] == 'ADDED' and pod_state not in valid_state:
|
||||
LOG.debug("Pod %s in %s", pod_name, pod_state)
|
||||
pods_found.append(pod_name)
|
||||
elif pod_name in pods_found:
|
||||
if pod_state in valid_state:
|
||||
pods_found.remove(pod_name)
|
||||
LOG.debug(pods_found)
|
||||
if event_type in {'ADDED', 'MODIFIED'}:
|
||||
status = event['object'].status
|
||||
is_ready = status.phase in READY_PHASES
|
||||
|
||||
if not pods_found:
|
||||
LOG.debug('Terminate wait')
|
||||
w.stop()
|
||||
if is_ready:
|
||||
LOG.debug('Pod %s (%s) is_ready=%s', pod_name, event_type,
|
||||
is_ready)
|
||||
else:
|
||||
container_statuses = status.container_statuses
|
||||
conditions = status.conditions
|
||||
LOG.debug('Pod %s (%s) is_ready=%s container_statuses=%s '
|
||||
'conditions=%s', pod_name, event_type, is_ready,
|
||||
container_statuses, conditions)
|
||||
|
||||
ready_pods[pod_name] = is_ready
|
||||
|
||||
if event_type == 'MODIFIED':
|
||||
modified_pods.add(pod_name)
|
||||
|
||||
elif event_type == 'DELETED':
|
||||
LOG.debug('Removing pod %s from tracking', pod_name)
|
||||
ready_pods.pop(pod_name)
|
||||
|
||||
elif event_type == 'ERROR':
|
||||
LOG.error('Got error event for pod: %s',
|
||||
event['object'].to_dict())
|
||||
raise exceptions.KubernetesErrorEventException(
|
||||
'Got error event for pod: %s' % event['object'])
|
||||
|
||||
else:
|
||||
LOG.error('Unrecognized event type (%s) for pod: %s',
|
||||
event_type, event['object'])
|
||||
raise exceptions.KubernetesUnknownStreamingEventTypeException(
|
||||
'Got unknown event type (%s) for pod: %s'
|
||||
% (event_type, event['object']))
|
||||
|
||||
if all(ready_pods.values()):
|
||||
return (False, modified_pods, [])
|
||||
|
||||
# NOTE(mark-burnett): This path is reachable if there are no pods
|
||||
# (unlikely) or in the case of the watch timing out.
|
||||
return (not all(ready_pods.values()), modified_pods,
|
||||
[name for name, ready in ready_pods.items() if not ready])
|
||||
|
|
|
@ -21,6 +21,7 @@ Armada Exceptions
|
|||
.. include:: armada-exceptions.inc
|
||||
.. include:: base-exceptions.inc
|
||||
.. include:: chartbuilder-exceptions.inc
|
||||
.. include:: k8s-exceptions.inc
|
||||
.. include:: lint-exceptions.inc
|
||||
.. include:: manifest-exceptions.inc
|
||||
.. include:: override-exceptions.inc
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
..
|
||||
Copyright 2018 AT&T Intellectual Property.
|
||||
All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
not use this file except in compliance with the License. You may obtain
|
||||
a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
License for the specific language governing permissions and limitations
|
||||
under the License.
|
||||
|
||||
|
||||
.. list-table:: **Kubernetes Exceptions**
|
||||
:widths: 5 50
|
||||
:header-rows: 1
|
||||
|
||||
* - Exception Name
|
||||
- Description
|
||||
* - KubernetesErrorEventException
|
||||
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesErrorEventException
|
||||
:members:
|
||||
:show-inheritance:
|
||||
:undoc-members:
|
||||
* - KubernetesUnknownStreamngEventTypeException
|
||||
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesUnknownStreamngEventTypeException
|
||||
:members:
|
||||
:show-inheritance:
|
||||
:undoc-members:
|
Loading…
Reference in New Issue