fix(timeouts): Address timeout handling issues

- fixing wait handling in multiple areas
      -- wait for deleted pods before continuing Apply update
      -- cleaning up and delineating wait for charts vs chartgroups
      -- timeout exceptions to stop execution
    - api/cli 'timeout' param now applies to all Charts
    - api/cli 'wait' param now applies to all Charts
    - update some docs
    - several TODOs to be addressed in future PS

Closes #199

Change-Id: I5a697508ce6027e9182f3f1f61757319a3ed3593
This commit is contained in:
Marshall Margenau 2018-04-19 11:05:59 -05:00
parent b5d5c2dfd5
commit dc508d5012
12 changed files with 373 additions and 194 deletions

View File

@ -80,8 +80,8 @@ class Apply(api.BaseResource):
enable_chart_cleanup=req.get_param_as_bool(
'enable_chart_cleanup'),
dry_run=req.get_param_as_bool('dry_run'),
tiller_should_wait=req.get_param_as_bool('wait'),
tiller_timeout=req.get_param_as_int('timeout') or 3600,
force_wait=req.get_param_as_bool('wait'),
timeout=req.get_param_as_int('timeout') or 0,
tiller_host=req.get_param('tiller_host'),
tiller_port=req.get_param_as_int(
'tiller_port') or CONF.tiller_port,

View File

@ -106,9 +106,10 @@ SHORT_DESC = "Command installs manifest charts."
type=str,
default=CONF.tiller_namespace)
@click.option('--timeout',
help="Specifies time to wait for charts to deploy.",
help="Specifies time to wait for each chart to fully "
"finish deploying.",
type=int,
default=3600)
default=0)
@click.option('--values', '-f',
help=("Use to override multiple Armada Manifest values by "
"reading overrides from a values.yaml-type file."),
@ -116,7 +117,9 @@ SHORT_DESC = "Command installs manifest charts."
type=str,
default=[])
@click.option('--wait',
help="Wait until all charts deployed.",
help=("Force Tiller to wait until all charts are deployed, "
"rather than using each chart's specified wait policy. "
"This is equivalent to sequenced chartgroups."),
is_flag=True)
@click.option('--target-manifest',
help=("The target manifest to run. Required for specifying "
@ -207,8 +210,8 @@ class ApplyManifest(CliAction):
enable_chart_cleanup=self.enable_chart_cleanup,
dry_run=self.dry_run,
set_ovr=self.set,
tiller_should_wait=self.wait,
tiller_timeout=self.timeout,
force_wait=self.wait,
timeout=self.timeout,
tiller_host=self.tiller_host,
tiller_port=self.tiller_port,
tiller_namespace=self.tiller_namespace,

View File

@ -23,11 +23,14 @@ KEYWORD_PREFIX = 'release_prefix'
KEYWORD_GROUPS = 'chart_groups'
KEYWORD_CHARTS = 'chart_group'
KEYWORD_RELEASE = 'release'
KEYWORD_CHART = 'chart'
# Statuses
# Tiller
DEFAULT_CHART_TIMEOUT = 3600
STATUS_DEPLOYED = 'DEPLOYED'
STATUS_FAILED = 'FAILED'
# Kubernetes
DEFAULT_K8S_TIMEOUT = 300
# Configuration File
CONFIG_PATH = '/etc/armada'

View File

@ -21,6 +21,14 @@ class ArmadaException(base_exception.ArmadaBaseException):
message = 'An unknown Armada handler error occurred.'
class ArmadaTimeoutException(ArmadaException):
    '''Exception raised when Armada times out while processing.

    The human-readable reason for the timeout is interpolated into the
    exception message and also kept on the instance as ``_message``.
    '''

    def __init__(self, reason):
        # Build the message once so the stored copy and the one passed to
        # the base class are guaranteed to be the same string.
        self._message = 'Armada timed out waiting on: %s' % reason
        super(ArmadaTimeoutException, self).__init__(self._message)
class KnownReleasesException(ArmadaException):
'''
Exception that occurs when no known releases are found.

View File

@ -18,7 +18,13 @@ from armada.exceptions.base_exception import ArmadaBaseException as ex
class KubernetesException(ex):
'''Base class for Kubernetes exceptions and error handling.'''
message = 'An unknown Kubernetes error occured.'
message = 'An unknown Kubernetes error occurred.'
class KubernetesWatchTimeoutException(KubernetesException):
    '''Exception for timing out during a watch on a Kubernetes object.'''
    # Fixed message: callers raise with their own detail string where needed.
    message = 'Kubernetes Watch has timed out.'
class KubernetesUnknownStreamingEventTypeException(KubernetesException):

View File

@ -13,6 +13,7 @@
# limitations under the License.
import difflib
import time
import yaml
from oslo_config import cfg
@ -22,18 +23,23 @@ from armada.handlers.chartbuilder import ChartBuilder
from armada.handlers.manifest import Manifest
from armada.handlers.override import Override
from armada.handlers.tiller import Tiller
from armada.exceptions import armada_exceptions
from armada.exceptions.armada_exceptions import ArmadaTimeoutException
from armada.exceptions import source_exceptions
from armada.exceptions import validate_exceptions
from armada.exceptions import tiller_exceptions
from armada.utils.release import release_prefix
from armada.utils import source
from armada.utils import validate
from armada import const
from armada.const import DEFAULT_CHART_TIMEOUT
from armada.const import KEYWORD_ARMADA
from armada.const import KEYWORD_CHARTS
from armada.const import KEYWORD_GROUPS
from armada.const import KEYWORD_PREFIX
from armada.const import STATUS_FAILED
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
DEFAULT_TIMEOUT = 3600
class Armada(object):
@ -49,8 +55,8 @@ class Armada(object):
enable_chart_cleanup=False,
dry_run=False,
set_ovr=None,
tiller_should_wait=False,
tiller_timeout=DEFAULT_TIMEOUT,
force_wait=False,
timeout=0,
tiller_host=None,
tiller_port=None,
tiller_namespace=None,
@ -67,10 +73,10 @@ class Armada(object):
operations.
:param bool enable_chart_cleanup: Clean up unmanaged charts.
:param bool dry_run: Run charts without installing them.
:param bool tiller_should_wait: Specifies whether Tiller should wait
until all charts are deployed.
:param int tiller_timeout: Specifies time Tiller should wait for charts
to deploy until timing out.
:param bool force_wait: Force Tiller to wait until all charts are
deployed, rather than using each chart's specified wait policy.
:param int timeout: Specifies overall time in seconds that Tiller
should wait for charts until timing out.
:param str tiller_host: Tiller host IP. Default is None.
:param int tiller_port: Tiller host port. Default is
``CONF.tiller_port``.
@ -90,8 +96,8 @@ class Armada(object):
self.disable_update_post = disable_update_post
self.enable_chart_cleanup = enable_chart_cleanup
self.dry_run = dry_run
self.tiller_should_wait = tiller_should_wait
self.tiller_timeout = tiller_timeout
self.force_wait = force_wait
self.timeout = timeout
self.tiller = Tiller(
tiller_host=tiller_host, tiller_port=tiller_port,
tiller_namespace=tiller_namespace)
@ -140,13 +146,13 @@ class Armada(object):
details=','.join([m.get('message') for m in msg_list]))
# Purge known releases that have failed and are in the current yaml
armada_data = self.manifest.get(const.KEYWORD_ARMADA, {})
prefix = armada_data.get(const.KEYWORD_PREFIX, '')
failed_releases = self.get_releases_by_status(const.STATUS_FAILED)
manifest_data = self.manifest.get(KEYWORD_ARMADA, {})
prefix = manifest_data.get(KEYWORD_PREFIX, '')
failed_releases = self.get_releases_by_status(STATUS_FAILED)
for release in failed_releases:
for group in armada_data.get(const.KEYWORD_GROUPS, []):
for ch in group.get(const.KEYWORD_CHARTS, []):
for group in manifest_data.get(KEYWORD_GROUPS, []):
for ch in group.get(KEYWORD_CHARTS, []):
ch_release_name = release_prefix(
prefix, ch.get('chart', {}).get('chart_name'))
if release[0] == ch_release_name:
@ -159,8 +165,8 @@ class Armada(object):
# We only support a git source type right now, which can also
# handle git:// local paths as well
repos = {}
for group in armada_data.get(const.KEYWORD_GROUPS, []):
for ch in group.get(const.KEYWORD_CHARTS, []):
for group in manifest_data.get(KEYWORD_GROUPS, []):
for ch in group.get(KEYWORD_CHARTS, []):
self.tag_cloned_repo(ch, repos)
for dep in ch.get('chart', {}).get('dependencies', []):
@ -241,79 +247,82 @@ class Armada(object):
# extract known charts on tiller right now
known_releases = self.tiller.list_charts()
armada_data = self.manifest.get(const.KEYWORD_ARMADA, {})
prefix = armada_data.get(const.KEYWORD_PREFIX, '')
manifest_data = self.manifest.get(KEYWORD_ARMADA, {})
prefix = manifest_data.get(KEYWORD_PREFIX, '')
# TODO(fmontei): This is a useless exception that is probably never
# thrown as `known_releases` is a list and the proper behavior here
# should be to return early. Fix this once all side effects of
# correcting this are well understood.
if known_releases is None:
raise armada_exceptions.KnownReleasesException()
for chartgroup in manifest_data.get(KEYWORD_GROUPS, []):
cg_name = chartgroup.get('name', '<missing name>')
cg_desc = chartgroup.get('description', '<missing description>')
LOG.info('Processing ChartGroup: %s (%s)', cg_name, cg_desc)
for release in known_releases:
LOG.debug("Release %s, Version %s found on Tiller", release[0],
release[1])
cg_sequenced = chartgroup.get('sequenced', False)
cg_test_all_charts = chartgroup.get('test_charts', False)
for group in armada_data.get(const.KEYWORD_GROUPS, []):
tiller_should_wait = self.tiller_should_wait
tiller_timeout = self.tiller_timeout
desc = group.get('description', 'A Chart Group')
charts = group.get(const.KEYWORD_CHARTS, [])
test_charts = group.get('test_charts', False)
namespaces_seen = set()
tests_to_run = []
if group.get('sequenced', False) or test_charts:
tiller_should_wait = True
cg_charts = chartgroup.get(KEYWORD_CHARTS, [])
LOG.info('Deploying: %s', desc)
# Track largest Chart timeout to stop the ChartGroup at the end
cg_max_timeout = 0
for chart in charts:
chart = chart.get('chart', {})
for chart_entry in cg_charts:
chart = chart_entry.get('chart', {})
namespace = chart.get('namespace')
namespaces_seen.add(namespace)
release = chart.get('release')
values = chart.get('values', {})
test_chart = chart.get('test', False)
namespace = chart.get('namespace', None)
release = chart.get('release', None)
pre_actions = {}
post_actions = {}
if release is None:
continue
release_name = release_prefix(prefix, release)
if test_chart is True:
tiller_should_wait = True
# Retrieve appropriate timeout value
wait_timeout = self.timeout
if wait_timeout <= 0:
# TODO(MarshM): chart's `data.timeout` should be deprecated
chart_timeout = chart.get('timeout', 0)
# Favor data.wait.timeout over data.timeout, until removed
wait_values = chart.get('wait', {})
wait_timeout = wait_values.get('timeout', chart_timeout)
wait_labels = wait_values.get('labels', {})
# retrieve appropriate timeout value
# TODO(MarshM): chart's `data.timeout` should be deprecated
# to favor `data.wait.timeout`
# TODO(MarshM) also: timeout logic seems to prefer chart values
# over api/cli, probably should swap?
# (caution: it always default to 3600,
# take care to differentiate user input)
if tiller_should_wait and tiller_timeout == DEFAULT_TIMEOUT:
tiller_timeout = chart.get('timeout', tiller_timeout)
wait_values = chart.get('wait', {})
wait_timeout = wait_values.get('timeout', tiller_timeout)
wait_values_labels = wait_values.get('labels', {})
this_chart_should_wait = (
cg_sequenced or self.force_wait or
wait_timeout > 0 or len(wait_labels) > 0)
if this_chart_should_wait and wait_timeout <= 0:
LOG.warn('No Chart timeout specified, using default: %ss',
DEFAULT_CHART_TIMEOUT)
wait_timeout = DEFAULT_CHART_TIMEOUT
# Naively take largest timeout to apply at end
# TODO(MarshM) better handling of timeout/timer
cg_max_timeout = max(wait_timeout, cg_max_timeout)
# Chart test policy can override ChartGroup, if specified
test_this_chart = chart.get('test', cg_test_all_charts)
chartbuilder = ChartBuilder(chart)
protoc_chart = chartbuilder.get_helm_chart()
# determine install or upgrade by examining known releases
LOG.debug("RELEASE: %s", release)
deployed_releases = [x[0] for x in known_releases]
prefix_chart = release_prefix(prefix, release)
# Begin Chart timeout deadline
deadline = time.time() + wait_timeout
# TODO(mark-burnett): It may be more robust to directly call
# tiller status to decide whether to install/upgrade rather
# than checking for list membership.
if prefix_chart in deployed_releases:
if release_name in deployed_releases:
# indicate to the end user what path we are taking
LOG.info("Upgrading release %s", release)
LOG.info("Upgrading release %s in namespace %s",
release_name, namespace)
# extract the installed chart and installed values from the
# latest release so we can compare to the intended state
apply_chart, apply_values = self.find_release_chart(
known_releases, prefix_chart)
known_releases, release_name)
upgrade = chart.get('upgrade', {})
disable_hooks = upgrade.get('no_hooks', False)
@ -329,7 +338,7 @@ class Armada(object):
if not self.disable_update_post and upgrade_post:
post_actions = upgrade_post
# show delta for both the chart templates and the chart
# Show delta for both the chart templates and the chart
# values
# TODO(alanmeadows) account for .files differences
# once we support those
@ -342,80 +351,113 @@ class Armada(object):
LOG.info("There are no updates found in this chart")
continue
# TODO(MarshM): Add tiller dry-run before upgrade and
# consider deadline impacts
# do actual update
LOG.info('Beginning Upgrade, wait: %s, %s',
tiller_should_wait, wait_timeout)
self.tiller.update_release(
timer = int(round(deadline - time.time()))
LOG.info('Beginning Upgrade, wait=%s, timeout=%ss',
this_chart_should_wait, timer)
tiller_result = self.tiller.update_release(
protoc_chart,
prefix_chart,
release_name,
namespace,
pre_actions=pre_actions,
post_actions=post_actions,
dry_run=self.dry_run,
disable_hooks=disable_hooks,
values=yaml.safe_dump(values),
wait=tiller_should_wait,
timeout=wait_timeout)
wait=this_chart_should_wait,
timeout=timer)
if tiller_should_wait:
if this_chart_should_wait:
self.tiller.k8s.wait_until_ready(
release=prefix_chart,
labels=wait_values_labels,
release=release_name,
labels=wait_labels,
namespace=namespace,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=wait_timeout
timeout=timer
)
msg['upgrade'].append(prefix_chart)
LOG.info('Upgrade completed with results from Tiller: %s',
tiller_result.__dict__)
msg['upgrade'].append(release_name)
# process install
else:
LOG.info("Installing release %s", release)
LOG.info('Beginning Install, wait: %s, %s',
tiller_should_wait, wait_timeout)
self.tiller.install_release(
LOG.info("Installing release %s in namespace %s",
release_name, namespace)
timer = int(round(deadline - time.time()))
LOG.info('Beginning Install, wait=%s, timeout=%ss',
this_chart_should_wait, timer)
tiller_result = self.tiller.install_release(
protoc_chart,
prefix_chart,
release_name,
namespace,
dry_run=self.dry_run,
values=yaml.safe_dump(values),
wait=tiller_should_wait,
timeout=wait_timeout)
wait=this_chart_should_wait,
timeout=timer)
if tiller_should_wait:
if this_chart_should_wait:
self.tiller.k8s.wait_until_ready(
release=prefix_chart,
labels=wait_values_labels,
release=release_name,
labels=wait_labels,
namespace=namespace,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=wait_timeout
timeout=timer
)
msg['install'].append(prefix_chart)
LOG.info('Install completed with results from Tiller: %s',
tiller_result.__dict__)
msg['install'].append(release_name)
LOG.debug("Cleaning up chart source in %s",
chartbuilder.source_directory)
# Sequenced ChartGroup should run tests after each Chart
timer = int(round(deadline - time.time()))
if test_this_chart and cg_sequenced:
LOG.info('Running sequenced test, timeout remaining: %ss.',
timer)
if timer <= 0:
reason = ('Timeout expired before testing sequenced '
'release %s' % release_name)
LOG.error(reason)
raise ArmadaTimeoutException(reason)
self._test_chart(release_name, timer)
if test_charts or (test_chart is True):
LOG.info('Testing: %s', prefix_chart)
resp = self.tiller.testing_release(prefix_chart)
test_status = getattr(resp.info.status,
'last_test_suite_run', 'FAILED')
LOG.info("Test INFO: %s", test_status)
if resp:
LOG.info("PASSED: %s", prefix_chart)
else:
LOG.info("FAILED: %s", prefix_chart)
# Un-sequenced ChartGroup should run tests at the end
elif test_this_chart:
# Keeping track of time remaining
tests_to_run.append((release_name, timer))
# TODO(MarshM) does this need release/labels/namespace?
# TODO(MarshM) consider the tiller_timeout according to above logic
LOG.info('Wait after Chartgroup (%s) %ssec', desc, tiller_timeout)
self.tiller.k8s.wait_until_ready(
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=tiller_timeout)
# End of Charts in ChartGroup
LOG.info('All Charts applied.')
# After all Charts are applied, we should wait for the entire
# ChartGroup to become healthy by looking at the namespaces seen
# TODO(MarshM): Need to restrict to only charts we processed
# TODO(MarshM): Need to determine a better timeout
deadline = time.time() + cg_max_timeout
for ns in namespaces_seen:
timer = int(round(deadline - time.time()))
LOG.info('Final wait for healthy namespace (%s), '
'timeout remaining: %ss.', ns, timer)
if timer <= 0:
reason = 'Timeout expired waiting on namespace: %s' % (ns)
LOG.error(reason)
raise ArmadaTimeoutException(reason)
self.tiller.k8s.wait_until_ready(
namespace=ns,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=timer)
# After entire ChartGroup is healthy, run any pending tests
for (test, test_timer) in tests_to_run:
self._test_chart(test, test_timer)
LOG.info("Performing Post-Flight Operations")
self.post_flight_ops()
@ -423,7 +465,7 @@ class Armada(object):
if self.enable_chart_cleanup:
self.tiller.chart_cleanup(
prefix,
self.manifest[const.KEYWORD_ARMADA][const.KEYWORD_GROUPS])
self.manifest[KEYWORD_ARMADA][KEYWORD_GROUPS])
return msg
@ -432,53 +474,65 @@ class Armada(object):
Operations to run after deployment process has terminated
'''
# Delete temp dirs used for deployment
for group in self.manifest.get(const.KEYWORD_ARMADA, {}).get(
const.KEYWORD_GROUPS, []):
for ch in group.get(const.KEYWORD_CHARTS, []):
for group in self.manifest.get(KEYWORD_ARMADA, {}).get(
KEYWORD_GROUPS, []):
for ch in group.get(KEYWORD_CHARTS, []):
chart = ch.get('chart', {})
if chart.get('source', {}).get('type') == 'git':
source_dir = chart.get('source_dir')
if isinstance(source_dir, tuple) and source_dir:
source.source_cleanup(source_dir[0])
def _test_chart(self, release_name, timeout):
    '''Run Helm tests for a release and report the outcome.

    :param str release_name: fully-prefixed release to test.
    :param int timeout: seconds allotted for the Tiller test call.
    :return: True when Tiller returned a truthy test response,
        False otherwise.
    '''
    # TODO(MarshM): Fix testing, it's broken, and track timeout
    resp = self.tiller.testing_release(release_name, timeout=timeout)
    status = getattr(resp.info.status, 'last_test_suite_run', 'FAILED')
    LOG.info("Test INFO: %s", status)
    # Guard clause: a falsy response from Tiller means the test failed.
    if not resp:
        LOG.info("FAILED: %s", release_name)
        return False
    LOG.info("PASSED: %s", release_name)
    return True
def show_diff(self, chart, installed_chart, installed_values, target_chart,
target_values, msg):
'''
Produce a unified diff of the installed chart vs our intention
'''Produce a unified diff of the installed chart vs our intention'''
TODO(alanmeadows): This needs to be rewritten to produce better
unified diff output
'''
# TODO(MarshM) This gives decent output comparing values. Would be
# nice to clean it up further. Are \\n or \n\n ever valid diffs?
# Can these be cleanly converted to dicts, for easier compare?
def _sanitize_diff_str(str):
return str.replace('\\n', '\n').replace('\n\n', '\n').split('\n')
source = str(installed_chart.SerializeToString()).split('\n')
chart_diff = list(
difflib.unified_diff(source, str(target_chart).split('\n')))
source = _sanitize_diff_str(str(installed_chart.SerializeToString()))
target = _sanitize_diff_str(str(target_chart))
chart_diff = list(difflib.unified_diff(source, target, n=0))
chart_release = chart.get('release', None)
if len(chart_diff) > 0:
LOG.info("Found diff in Chart (%s)", chart_release)
diff_msg = []
for line in chart_diff:
diff_msg.append(line)
msg['diff'].append({'chart': diff_msg})
pretty_diff = '\n'.join(diff_msg).replace(
'\\n', '\n').replace('\n\n', '\n')
LOG.info("Found diff in chart (%s)", chart_release)
pretty_diff = '\n'.join(diff_msg)
LOG.debug(pretty_diff)
values_diff = list(
difflib.unified_diff(
installed_values.split('\n'),
yaml.safe_dump(target_values).split('\n')))
source = _sanitize_diff_str(installed_values)
target = _sanitize_diff_str(yaml.safe_dump(target_values))
values_diff = list(difflib.unified_diff(source, target, n=0))
if len(values_diff) > 0:
LOG.info("Found diff in values (%s)", chart_release)
diff_msg = []
for line in values_diff:
diff_msg.append(line)
msg['diff'].append({'values': diff_msg})
pretty_diff = '\n'.join(diff_msg).replace(
'\\n', '\n').replace('\n\n', '\n')
LOG.info("Found diff in chart values (%s)", chart_release)
pretty_diff = '\n'.join(diff_msg)
LOG.debug(pretty_diff)
result = (len(chart_diff) > 0) or (len(values_diff) > 0)

View File

@ -22,6 +22,7 @@ from kubernetes.client.rest import ApiException
from oslo_config import cfg
from oslo_log import log as logging
from armada.const import DEFAULT_K8S_TIMEOUT
from armada.utils.release import label_selectors
from armada.exceptions import k8s_exceptions as exceptions
@ -48,7 +49,8 @@ class K8s(object):
self.extension_api = client.ExtensionsV1beta1Api()
def delete_job_action(self, name, namespace="default",
propagation_policy='Foreground'):
propagation_policy='Foreground',
timeout=DEFAULT_K8S_TIMEOUT):
'''
:params name - name of the job
:params namespace - name of pod that job
@ -58,13 +60,36 @@ class K8s(object):
before the Job is marked as deleted.
'''
try:
LOG.debug('Deleting job %s, Wait timeout=%s', name, timeout)
body = client.V1DeleteOptions()
self.batch_api.delete_namespaced_job(
name=name, namespace=namespace, body=body,
propagation_policy=propagation_policy)
w = watch.Watch()
issue_delete = True
for event in w.stream(self.batch_api.list_namespaced_job,
namespace=namespace,
timeout_seconds=timeout):
if issue_delete:
self.batch_api.delete_namespaced_job(
name=name, namespace=namespace, body=body,
propagation_policy=propagation_policy)
issue_delete = False
event_type = event['type'].upper()
job_name = event['object'].metadata.name
if event_type == 'DELETED' and job_name == name:
LOG.debug('Successfully deleted job %s', job_name)
return
err_msg = ('Reached timeout while waiting to delete job: '
'name=%s, namespace=%s' % (name, namespace))
LOG.error(err_msg)
raise exceptions.KubernetesWatchTimeoutException(err_msg)
except ApiException as e:
LOG.error("Exception when deleting job: name=%s, namespace=%s: %s",
name, namespace, e)
LOG.exception(
"Exception when deleting job: name=%s, namespace=%s",
name, namespace)
raise e
def get_namespace_job(self, namespace="default", label_selector=''):
'''
@ -188,7 +213,8 @@ class K8s(object):
LOG.info('New pod %s deployed', new_pod_name)
w.stop()
def wait_get_completed_podphase(self, release, timeout=300):
def wait_get_completed_podphase(self, release,
timeout=DEFAULT_K8S_TIMEOUT):
'''
:param release - part of namespace
:param timeout - time before disconnecting stream
@ -207,9 +233,9 @@ class K8s(object):
def wait_until_ready(self,
release=None,
namespace='default',
namespace='',
labels='',
timeout=300,
timeout=DEFAULT_K8S_TIMEOUT,
k8s_wait_attempts=1,
k8s_wait_attempt_sleep=1):
'''
@ -232,10 +258,19 @@ class K8s(object):
sleep_time = (k8s_wait_attempt_sleep if k8s_wait_attempt_sleep >= 1
else 1)
LOG.debug("Wait on %s (%s) for %s sec (k8s wait %s times, sleep %ss)",
LOG.debug("Wait on namespace=(%s) labels=(%s) for %s sec "
"(k8s wait %s times, sleep %ss)",
namespace, label_selector, timeout,
wait_attempts, sleep_time)
if not namespace:
# This shouldn't be reachable
LOG.warn('"namespace" not specified, waiting across all available '
'namespaces is likely to cause unintended consequences.')
if not label_selector:
LOG.warn('"label_selector" not specified, waiting with no labels '
'may cause unintended consequences.')
deadline = time.time() + timeout
# NOTE(mark-burnett): Attempt to wait multiple times without
@ -243,7 +278,7 @@ class K8s(object):
successes = 0
while successes < wait_attempts:
deadline_remaining = int(deadline - time.time())
deadline_remaining = int(round(deadline - time.time()))
if deadline_remaining <= 0:
return False
timed_out, modified_pods, unready_pods = self._wait_one_time(
@ -253,6 +288,9 @@ class K8s(object):
if timed_out:
LOG.info('Timed out waiting for pods: %s',
sorted(unready_pods))
raise exceptions.KubernetesWatchTimeoutException(
'Timed out while waiting on namespace=(%s) labels=(%s)' %
(namespace, label_selector))
return False
if modified_pods:
@ -276,17 +314,23 @@ class K8s(object):
w = watch.Watch()
first_event = True
# TODO(MarshM) still need to filter pods on namespace
for event in w.stream(self.client.list_pod_for_all_namespaces,
label_selector=label_selector,
timeout_seconds=timeout):
# Watch across specific namespace, or all
kwargs = {
'label_selector': label_selector,
'timeout_seconds': timeout,
}
if namespace:
func_to_call = self.client.list_namespaced_pod
kwargs['namespace'] = namespace
else:
func_to_call = self.client.list_pod_for_all_namespaces
for event in w.stream(func_to_call, **kwargs):
if first_event:
pod_list = self.client.list_pod_for_all_namespaces(
label_selector=label_selector,
timeout_seconds=timeout)
pod_list = func_to_call(**kwargs)
for pod in pod_list.items:
LOG.debug('Setting up to wait for pod %s',
pod.metadata.name)
LOG.debug('Setting up to wait for pod %s namespace=%s',
pod.metadata.name, pod.metadata.namespace)
ready_pods[pod.metadata.name] = False
first_event = False

View File

@ -51,6 +51,16 @@ CONF = cfg.CONF
LOG = logging.getLogger(__name__)
class TillerResult(object):
    '''Object to hold Tiller results for Armada.

    A plain value object capturing the fields Armada cares about from a
    Tiller install/update response: release name, namespace, status,
    description, and version.
    '''

    def __init__(self, release, namespace, status, description, version):
        # Store each field verbatim; no validation or conversion is done.
        fields = (
            ('release', release),
            ('namespace', namespace),
            ('status', status),
            ('description', description),
            ('version', version),
        )
        for name, value in fields:
            setattr(self, name, value)
class Tiller(object):
'''
The Tiller class supports communication and requests to the Tiller Helm
@ -164,10 +174,14 @@ class Tiller(object):
'''
List Helm Releases
'''
# TODO(MarshM possibly combine list_releases() with list_charts()
# since they do the same thing, grouping output differently
releases = []
stub = ReleaseServiceStub(self.channel)
# TODO(mark-burnett): Since we're limiting page size, we need to
# iterate through all the pages when collecting this list.
# NOTE(MarshM): `Helm List` defaults to returning Deployed and Failed,
# but this might not be a desireable ListReleasesRequest default.
req = ListReleasesRequest(limit=RELEASE_LIMIT,
status_codes=[STATUS_DEPLOYED,
STATUS_FAILED],
@ -179,6 +193,7 @@ class Tiller(object):
metadata=self.metadata)
for y in release_list:
# TODO(MarshM) this log is too noisy, fix later
# LOG.debug('Found release: %s', y.releases
releases.extend(y.releases)
@ -210,7 +225,7 @@ class Tiller(object):
return template
def _pre_update_actions(self, actions, release_name, namespace, chart,
disable_hooks, values):
disable_hooks, values, timeout):
'''
:params actions - array of items actions
:params namespace - name of pod for actions
@ -225,9 +240,10 @@ class Tiller(object):
self.rolling_upgrade_pod_deployment(
name, release_name, namespace, labels,
action_type, chart, disable_hooks, values)
action_type, chart, disable_hooks, values, timeout)
except Exception:
LOG.warn("Pre: Could not update anything, please check yaml")
raise ex.PreUpdateJobDeleteException(name, namespace)
try:
for action in actions.get('delete', []):
@ -235,8 +251,8 @@ class Tiller(object):
action_type = action.get('type')
labels = action.get('labels', None)
self.delete_resources(
release_name, name, action_type, labels, namespace)
self.delete_resources(release_name, name, action_type,
labels, namespace, timeout)
except Exception:
LOG.warn("PRE: Could not delete anything, please check yaml")
raise ex.PreUpdateJobDeleteException(name, namespace)
@ -275,17 +291,22 @@ class Tiller(object):
Returns a list of tuples in the form:
(name, version, chart, values, status)
'''
LOG.debug('Getting known releases from Tiller...')
charts = []
for latest_release in self.list_releases():
try:
charts.append(
(latest_release.name, latest_release.version,
latest_release.chart, latest_release.config.raw,
latest_release.info.status.Code.Name(
latest_release.info.status.code)))
except IndexError:
release = (
latest_release.name, latest_release.version,
latest_release.chart, latest_release.config.raw,
latest_release.info.status.Code.Name(
latest_release.info.status.code))
charts.append(release)
LOG.debug('Found release %s, version %s, status: %s',
release[0], release[1], release[4])
except (AttributeError, IndexError) as e:
LOG.debug('%s while getting releases: %s, ex=%s',
e.__class__.__name__, latest_release, e)
continue
# LOG.debug('List of Helm Charts from Latest Releases: %s', charts)
return charts
def update_release(self, chart, release, namespace,
@ -312,7 +333,7 @@ class Tiller(object):
values = Config(raw=values)
self._pre_update_actions(pre_actions, release, namespace, chart,
disable_hooks, values)
disable_hooks, values, timeout)
# build release install request
try:
@ -329,7 +350,16 @@ class Tiller(object):
update_msg = stub.UpdateRelease(
release_request, rel_timeout + GRPC_EPSILON,
metadata=self.metadata)
return update_msg
tiller_result = TillerResult(
update_msg.release.name,
update_msg.release.namespace,
update_msg.release.info.status.Code.Name(
update_msg.release.info.status.code),
update_msg.release.info.Description,
update_msg.release.version)
return tiller_result
except Exception:
LOG.exception('Error while updating release %s', release)
status = self.get_release_status(release)
@ -372,7 +402,16 @@ class Tiller(object):
install_msg = stub.InstallRelease(
release_request, rel_timeout + GRPC_EPSILON,
metadata=self.metadata)
return install_msg
tiller_result = TillerResult(
install_msg.release.name,
install_msg.release.namespace,
install_msg.release.info.status.Code.Name(
install_msg.release.info.status.code),
install_msg.release.info.Description,
install_msg.release.version)
return tiller_result
except Exception:
LOG.exception('Error while installing release %s', release)
status = self.get_release_status(release)
@ -527,7 +566,8 @@ class Tiller(object):
self.uninstall_release(chart)
def delete_resources(self, release_name, resource_name, resource_type,
resource_labels, namespace, wait=False):
resource_labels, namespace, wait=False,
timeout=TILLER_TIMEOUT):
'''
:params release_name - release name the specified resource is under
:params resource_name - name of specific resource
@ -541,16 +581,16 @@ class Tiller(object):
label_selector = ''
if resource_labels is not None:
label_selector = label_selectors(resource_labels)
LOG.debug("Deleting resources in namespace %s matching"
LOG.debug("Deleting resources in namespace %s matching "
"selectors %s.", namespace, label_selector)
if 'job' in resource_type:
get_jobs = self.k8s.get_namespace_job(namespace, label_selector)
for jb in get_jobs.items:
jb_name = jb.metadata.name
LOG.info("Deleting %s in namespace: %s", jb_name, namespace)
self.k8s.delete_job_action(jb_name, namespace)
LOG.info("Deleting job %s in namespace: %s",
jb_name, namespace)
self.k8s.delete_job_action(jb_name, namespace, timeout=timeout)
elif 'pod' in resource_type:
release_pods = self.k8s.get_namespace_pod(
@ -558,7 +598,8 @@ class Tiller(object):
for pod in release_pods.items:
pod_name = pod.metadata.name
LOG.info("Deleting %s in namespace: %s", pod_name, namespace)
LOG.info("Deleting pod %s in namespace: %s",
pod_name, namespace)
self.k8s.delete_namespace_pod(pod_name, namespace)
if wait:
self.k8s.wait_for_pod_redeployment(pod_name, namespace)
@ -568,7 +609,8 @@ class Tiller(object):
def rolling_upgrade_pod_deployment(self, name, release_name, namespace,
resource_labels, action_type, chart,
disable_hooks, values):
disable_hooks, values,
timeout=TILLER_TIMEOUT):
'''
update statefullsets (daemon, stateful)
'''
@ -607,7 +649,7 @@ class Tiller(object):
# delete pods
self.delete_resources(
release_name, name, 'pod', resource_labels, namespace,
wait=True)
wait=True, timeout=timeout)
else:
LOG.error("Unable to exectue name: % type: %s", name, action_type)

View File

@ -48,8 +48,8 @@ class ArmadaControllerTest(base.BaseControllerTest):
'disable_update_post': False,
'enable_chart_cleanup': False,
'dry_run': False,
'tiller_should_wait': False,
'tiller_timeout': 100,
'force_wait': False,
'timeout': 100,
'tiller_host': None,
'tiller_port': 44134,
'tiller_namespace': 'kube-system',

View File

@ -55,7 +55,8 @@ data:
location: /tmp/dummy/armada
subpath: chart_2
dependencies: []
timeout: 5
wait:
timeout: 10
---
schema: armada/Chart/v1
metadata:
@ -72,7 +73,8 @@ data:
subpath: chart_1
reference: master
dependencies: []
timeout: 50
wait:
timeout: 10
"""
CHART_SOURCES = [('git://github.com/dummy/armada', 'chart_1'),
@ -104,8 +106,10 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
'type': 'git'
},
'source_dir': CHART_SOURCES[0],
'timeout': 50,
'values': {}
'values': {},
'wait': {
'timeout': 10
}
}
},
{
@ -120,8 +124,10 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
'type': 'local'
},
'source_dir': CHART_SOURCES[1],
'timeout': 5,
'values': {}
'values': {},
'wait': {
'timeout': 10
}
}
}
],
@ -212,8 +218,8 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
chart_1['namespace'],
dry_run=armada_obj.dry_run,
values=yaml.safe_dump(chart_1['values']),
wait=armada_obj.tiller_should_wait,
timeout=armada_obj.tiller_timeout),
timeout=10,
wait=True),
mock.call(
mock_chartbuilder().get_helm_chart(),
"{}-{}".format(armada_obj.manifest['armada']['release_prefix'],
@ -221,7 +227,7 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
chart_2['namespace'],
dry_run=armada_obj.dry_run,
values=yaml.safe_dump(chart_2['values']),
wait=armada_obj.tiller_should_wait,
timeout=armada_obj.tiller_timeout)
timeout=10,
wait=True)
]
mock_tiller.return_value.install_release.assert_has_calls(method_calls)

View File

@ -18,6 +18,14 @@
:widths: 5 50
:header-rows: 1
* - Exception Name
- Description
* - ArmadaTimeoutException
- .. autoexception:: armada.exceptions.armada_exceptions.ArmadaTimeoutException
:members:
:show-inheritance:
:undoc-members:
* - Exception Name
- Description
* - KnownReleasesException

View File

@ -31,3 +31,8 @@
:members:
:show-inheritance:
:undoc-members:
* - KubernetesWatchTimeoutException
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesWatchTimeoutException
:members:
:show-inheritance:
:undoc-members: