fix(timeouts): Address timeout handling issues

- fixing wait handling in multiple areas
    -- wait for deleted pods before continuing Apply update
    -- cleaning up and delineating wait for charts vs chartgroups
    -- timeout exceptions to stop execution
- api/cli 'timeout' param now applies to all Charts
- api/cli 'wait' param now applies to all Charts
- update some docs
- several TODOs to be addressed in future PS

Closes #199

Change-Id: I5a697508ce6027e9182f3f1f61757319a3ed3593
This commit is contained in:
Marshall Margenau 2018-04-19 11:05:59 -05:00
parent b5d5c2dfd5
commit dc508d5012
12 changed files with 373 additions and 194 deletions

View File

@ -80,8 +80,8 @@ class Apply(api.BaseResource):
enable_chart_cleanup=req.get_param_as_bool( enable_chart_cleanup=req.get_param_as_bool(
'enable_chart_cleanup'), 'enable_chart_cleanup'),
dry_run=req.get_param_as_bool('dry_run'), dry_run=req.get_param_as_bool('dry_run'),
tiller_should_wait=req.get_param_as_bool('wait'), force_wait=req.get_param_as_bool('wait'),
tiller_timeout=req.get_param_as_int('timeout') or 3600, timeout=req.get_param_as_int('timeout') or 0,
tiller_host=req.get_param('tiller_host'), tiller_host=req.get_param('tiller_host'),
tiller_port=req.get_param_as_int( tiller_port=req.get_param_as_int(
'tiller_port') or CONF.tiller_port, 'tiller_port') or CONF.tiller_port,

View File

@ -106,9 +106,10 @@ SHORT_DESC = "Command installs manifest charts."
type=str, type=str,
default=CONF.tiller_namespace) default=CONF.tiller_namespace)
@click.option('--timeout', @click.option('--timeout',
help="Specifies time to wait for charts to deploy.", help="Specifies time to wait for each chart to fully "
"finish deploying.",
type=int, type=int,
default=3600) default=0)
@click.option('--values', '-f', @click.option('--values', '-f',
help=("Use to override multiple Armada Manifest values by " help=("Use to override multiple Armada Manifest values by "
"reading overrides from a values.yaml-type file."), "reading overrides from a values.yaml-type file."),
@ -116,7 +117,9 @@ SHORT_DESC = "Command installs manifest charts."
type=str, type=str,
default=[]) default=[])
@click.option('--wait', @click.option('--wait',
help="Wait until all charts deployed.", help=("Force Tiller to wait until all charts are deployed, "
"rather than using each chart's specified wait policy. "
"This is equivalent to sequenced chartgroups."),
is_flag=True) is_flag=True)
@click.option('--target-manifest', @click.option('--target-manifest',
help=("The target manifest to run. Required for specifying " help=("The target manifest to run. Required for specifying "
@ -207,8 +210,8 @@ class ApplyManifest(CliAction):
enable_chart_cleanup=self.enable_chart_cleanup, enable_chart_cleanup=self.enable_chart_cleanup,
dry_run=self.dry_run, dry_run=self.dry_run,
set_ovr=self.set, set_ovr=self.set,
tiller_should_wait=self.wait, force_wait=self.wait,
tiller_timeout=self.timeout, timeout=self.timeout,
tiller_host=self.tiller_host, tiller_host=self.tiller_host,
tiller_port=self.tiller_port, tiller_port=self.tiller_port,
tiller_namespace=self.tiller_namespace, tiller_namespace=self.tiller_namespace,

View File

@ -23,11 +23,14 @@ KEYWORD_PREFIX = 'release_prefix'
KEYWORD_GROUPS = 'chart_groups' KEYWORD_GROUPS = 'chart_groups'
KEYWORD_CHARTS = 'chart_group' KEYWORD_CHARTS = 'chart_group'
KEYWORD_RELEASE = 'release' KEYWORD_RELEASE = 'release'
KEYWORD_CHART = 'chart'
# Statuses # Tiller
DEFAULT_CHART_TIMEOUT = 3600
STATUS_DEPLOYED = 'DEPLOYED' STATUS_DEPLOYED = 'DEPLOYED'
STATUS_FAILED = 'FAILED' STATUS_FAILED = 'FAILED'
# Kubernetes
DEFAULT_K8S_TIMEOUT = 300
# Configuration File # Configuration File
CONFIG_PATH = '/etc/armada' CONFIG_PATH = '/etc/armada'

View File

@ -21,6 +21,14 @@ class ArmadaException(base_exception.ArmadaBaseException):
message = 'An unknown Armada handler error occurred.' message = 'An unknown Armada handler error occurred.'
class ArmadaTimeoutException(ArmadaException):
'''Exception that occurs when Armada times out while processing.'''
def __init__(self, reason):
self._message = 'Armada timed out waiting on: %s' % (reason)
super(ArmadaTimeoutException, self).__init__(self._message)
class KnownReleasesException(ArmadaException): class KnownReleasesException(ArmadaException):
''' '''
Exception that occurs when no known releases are found. Exception that occurs when no known releases are found.

View File

@ -18,7 +18,13 @@ from armada.exceptions.base_exception import ArmadaBaseException as ex
class KubernetesException(ex): class KubernetesException(ex):
'''Base class for Kubernetes exceptions and error handling.''' '''Base class for Kubernetes exceptions and error handling.'''
message = 'An unknown Kubernetes error occured.' message = 'An unknown Kubernetes error occurred.'
class KubernetesWatchTimeoutException(KubernetesException):
'''Exception for timing out during a watch on a Kubernetes object'''
message = 'Kubernetes Watch has timed out.'
class KubernetesUnknownStreamingEventTypeException(KubernetesException): class KubernetesUnknownStreamingEventTypeException(KubernetesException):

View File

@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import difflib import difflib
import time
import yaml import yaml
from oslo_config import cfg from oslo_config import cfg
@ -22,18 +23,23 @@ from armada.handlers.chartbuilder import ChartBuilder
from armada.handlers.manifest import Manifest from armada.handlers.manifest import Manifest
from armada.handlers.override import Override from armada.handlers.override import Override
from armada.handlers.tiller import Tiller from armada.handlers.tiller import Tiller
from armada.exceptions import armada_exceptions from armada.exceptions.armada_exceptions import ArmadaTimeoutException
from armada.exceptions import source_exceptions from armada.exceptions import source_exceptions
from armada.exceptions import validate_exceptions from armada.exceptions import validate_exceptions
from armada.exceptions import tiller_exceptions from armada.exceptions import tiller_exceptions
from armada.utils.release import release_prefix from armada.utils.release import release_prefix
from armada.utils import source from armada.utils import source
from armada.utils import validate from armada.utils import validate
from armada import const
from armada.const import DEFAULT_CHART_TIMEOUT
from armada.const import KEYWORD_ARMADA
from armada.const import KEYWORD_CHARTS
from armada.const import KEYWORD_GROUPS
from armada.const import KEYWORD_PREFIX
from armada.const import STATUS_FAILED
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
CONF = cfg.CONF CONF = cfg.CONF
DEFAULT_TIMEOUT = 3600
class Armada(object): class Armada(object):
@ -49,8 +55,8 @@ class Armada(object):
enable_chart_cleanup=False, enable_chart_cleanup=False,
dry_run=False, dry_run=False,
set_ovr=None, set_ovr=None,
tiller_should_wait=False, force_wait=False,
tiller_timeout=DEFAULT_TIMEOUT, timeout=0,
tiller_host=None, tiller_host=None,
tiller_port=None, tiller_port=None,
tiller_namespace=None, tiller_namespace=None,
@ -67,10 +73,10 @@ class Armada(object):
operations. operations.
:param bool enable_chart_cleanup: Clean up unmanaged charts. :param bool enable_chart_cleanup: Clean up unmanaged charts.
:param bool dry_run: Run charts without installing them. :param bool dry_run: Run charts without installing them.
:param bool tiller_should_wait: Specifies whether Tiller should wait :param bool force_wait: Force Tiller to wait until all charts are
until all charts are deployed. deployed, rather than using each chart's specified wait policy.
:param int tiller_timeout: Specifies time Tiller should wait for charts :param int timeout: Specifies overall time in seconds that Tiller
to deploy until timing out. should wait for charts until timing out.
:param str tiller_host: Tiller host IP. Default is None. :param str tiller_host: Tiller host IP. Default is None.
:param int tiller_port: Tiller host port. Default is :param int tiller_port: Tiller host port. Default is
``CONF.tiller_port``. ``CONF.tiller_port``.
@ -90,8 +96,8 @@ class Armada(object):
self.disable_update_post = disable_update_post self.disable_update_post = disable_update_post
self.enable_chart_cleanup = enable_chart_cleanup self.enable_chart_cleanup = enable_chart_cleanup
self.dry_run = dry_run self.dry_run = dry_run
self.tiller_should_wait = tiller_should_wait self.force_wait = force_wait
self.tiller_timeout = tiller_timeout self.timeout = timeout
self.tiller = Tiller( self.tiller = Tiller(
tiller_host=tiller_host, tiller_port=tiller_port, tiller_host=tiller_host, tiller_port=tiller_port,
tiller_namespace=tiller_namespace) tiller_namespace=tiller_namespace)
@ -140,13 +146,13 @@ class Armada(object):
details=','.join([m.get('message') for m in msg_list])) details=','.join([m.get('message') for m in msg_list]))
# Purge known releases that have failed and are in the current yaml # Purge known releases that have failed and are in the current yaml
armada_data = self.manifest.get(const.KEYWORD_ARMADA, {}) manifest_data = self.manifest.get(KEYWORD_ARMADA, {})
prefix = armada_data.get(const.KEYWORD_PREFIX, '') prefix = manifest_data.get(KEYWORD_PREFIX, '')
failed_releases = self.get_releases_by_status(const.STATUS_FAILED) failed_releases = self.get_releases_by_status(STATUS_FAILED)
for release in failed_releases: for release in failed_releases:
for group in armada_data.get(const.KEYWORD_GROUPS, []): for group in manifest_data.get(KEYWORD_GROUPS, []):
for ch in group.get(const.KEYWORD_CHARTS, []): for ch in group.get(KEYWORD_CHARTS, []):
ch_release_name = release_prefix( ch_release_name = release_prefix(
prefix, ch.get('chart', {}).get('chart_name')) prefix, ch.get('chart', {}).get('chart_name'))
if release[0] == ch_release_name: if release[0] == ch_release_name:
@ -159,8 +165,8 @@ class Armada(object):
# We only support a git source type right now, which can also # We only support a git source type right now, which can also
# handle git:// local paths as well # handle git:// local paths as well
repos = {} repos = {}
for group in armada_data.get(const.KEYWORD_GROUPS, []): for group in manifest_data.get(KEYWORD_GROUPS, []):
for ch in group.get(const.KEYWORD_CHARTS, []): for ch in group.get(KEYWORD_CHARTS, []):
self.tag_cloned_repo(ch, repos) self.tag_cloned_repo(ch, repos)
for dep in ch.get('chart', {}).get('dependencies', []): for dep in ch.get('chart', {}).get('dependencies', []):
@ -241,79 +247,82 @@ class Armada(object):
# extract known charts on tiller right now # extract known charts on tiller right now
known_releases = self.tiller.list_charts() known_releases = self.tiller.list_charts()
armada_data = self.manifest.get(const.KEYWORD_ARMADA, {}) manifest_data = self.manifest.get(KEYWORD_ARMADA, {})
prefix = armada_data.get(const.KEYWORD_PREFIX, '') prefix = manifest_data.get(KEYWORD_PREFIX, '')
# TODO(fmontei): This is a useless exception that is probably never for chartgroup in manifest_data.get(KEYWORD_GROUPS, []):
# thrown as `known_releases` is a list and the proper behavior here cg_name = chartgroup.get('name', '<missing name>')
# should be to return early. Fix this once all side effects of cg_desc = chartgroup.get('description', '<missing description>')
# correcting this are well understood. LOG.info('Processing ChartGroup: %s (%s)', cg_name, cg_desc)
if known_releases is None:
raise armada_exceptions.KnownReleasesException()
for release in known_releases: cg_sequenced = chartgroup.get('sequenced', False)
LOG.debug("Release %s, Version %s found on Tiller", release[0], cg_test_all_charts = chartgroup.get('test_charts', False)
release[1])
for group in armada_data.get(const.KEYWORD_GROUPS, []): namespaces_seen = set()
tiller_should_wait = self.tiller_should_wait tests_to_run = []
tiller_timeout = self.tiller_timeout
desc = group.get('description', 'A Chart Group')
charts = group.get(const.KEYWORD_CHARTS, [])
test_charts = group.get('test_charts', False)
if group.get('sequenced', False) or test_charts: cg_charts = chartgroup.get(KEYWORD_CHARTS, [])
tiller_should_wait = True
LOG.info('Deploying: %s', desc) # Track largest Chart timeout to stop the ChartGroup at the end
cg_max_timeout = 0
for chart in charts: for chart_entry in cg_charts:
chart = chart.get('chart', {}) chart = chart_entry.get('chart', {})
namespace = chart.get('namespace')
namespaces_seen.add(namespace)
release = chart.get('release')
values = chart.get('values', {}) values = chart.get('values', {})
test_chart = chart.get('test', False)
namespace = chart.get('namespace', None)
release = chart.get('release', None)
pre_actions = {} pre_actions = {}
post_actions = {} post_actions = {}
if release is None: release_name = release_prefix(prefix, release)
continue
if test_chart is True: # Retrieve appropriate timeout value
tiller_should_wait = True wait_timeout = self.timeout
if wait_timeout <= 0:
# TODO(MarshM): chart's `data.timeout` should be deprecated
chart_timeout = chart.get('timeout', 0)
# Favor data.wait.timeout over data.timeout, until removed
wait_values = chart.get('wait', {})
wait_timeout = wait_values.get('timeout', chart_timeout)
wait_labels = wait_values.get('labels', {})
# retrieve appropriate timeout value this_chart_should_wait = (
# TODO(MarshM): chart's `data.timeout` should be deprecated cg_sequenced or self.force_wait or
# to favor `data.wait.timeout` wait_timeout > 0 or len(wait_labels) > 0)
# TODO(MarshM) also: timeout logic seems to prefer chart values
# over api/cli, probably should swap? if this_chart_should_wait and wait_timeout <= 0:
# (caution: it always default to 3600, LOG.warn('No Chart timeout specified, using default: %ss',
# take care to differentiate user input) DEFAULT_CHART_TIMEOUT)
if tiller_should_wait and tiller_timeout == DEFAULT_TIMEOUT: wait_timeout = DEFAULT_CHART_TIMEOUT
tiller_timeout = chart.get('timeout', tiller_timeout)
wait_values = chart.get('wait', {}) # Naively take largest timeout to apply at end
wait_timeout = wait_values.get('timeout', tiller_timeout) # TODO(MarshM) better handling of timeout/timer
wait_values_labels = wait_values.get('labels', {}) cg_max_timeout = max(wait_timeout, cg_max_timeout)
# Chart test policy can override ChartGroup, if specified
test_this_chart = chart.get('test', cg_test_all_charts)
chartbuilder = ChartBuilder(chart) chartbuilder = ChartBuilder(chart)
protoc_chart = chartbuilder.get_helm_chart() protoc_chart = chartbuilder.get_helm_chart()
# determine install or upgrade by examining known releases
LOG.debug("RELEASE: %s", release)
deployed_releases = [x[0] for x in known_releases] deployed_releases = [x[0] for x in known_releases]
prefix_chart = release_prefix(prefix, release)
# Begin Chart timeout deadline
deadline = time.time() + wait_timeout
# TODO(mark-burnett): It may be more robust to directly call # TODO(mark-burnett): It may be more robust to directly call
# tiller status to decide whether to install/upgrade rather # tiller status to decide whether to install/upgrade rather
# than checking for list membership. # than checking for list membership.
if prefix_chart in deployed_releases: if release_name in deployed_releases:
# indicate to the end user what path we are taking # indicate to the end user what path we are taking
LOG.info("Upgrading release %s", release) LOG.info("Upgrading release %s in namespace %s",
release_name, namespace)
# extract the installed chart and installed values from the # extract the installed chart and installed values from the
# latest release so we can compare to the intended state # latest release so we can compare to the intended state
apply_chart, apply_values = self.find_release_chart( apply_chart, apply_values = self.find_release_chart(
known_releases, prefix_chart) known_releases, release_name)
upgrade = chart.get('upgrade', {}) upgrade = chart.get('upgrade', {})
disable_hooks = upgrade.get('no_hooks', False) disable_hooks = upgrade.get('no_hooks', False)
@ -329,7 +338,7 @@ class Armada(object):
if not self.disable_update_post and upgrade_post: if not self.disable_update_post and upgrade_post:
post_actions = upgrade_post post_actions = upgrade_post
# show delta for both the chart templates and the chart # Show delta for both the chart templates and the chart
# values # values
# TODO(alanmeadows) account for .files differences # TODO(alanmeadows) account for .files differences
# once we support those # once we support those
@ -342,80 +351,113 @@ class Armada(object):
LOG.info("There are no updates found in this chart") LOG.info("There are no updates found in this chart")
continue continue
# TODO(MarshM): Add tiller dry-run before upgrade and
# consider deadline impacts
# do actual update # do actual update
LOG.info('Beginning Upgrade, wait: %s, %s', timer = int(round(deadline - time.time()))
tiller_should_wait, wait_timeout) LOG.info('Beginning Upgrade, wait=%s, timeout=%ss',
self.tiller.update_release( this_chart_should_wait, timer)
tiller_result = self.tiller.update_release(
protoc_chart, protoc_chart,
prefix_chart, release_name,
namespace, namespace,
pre_actions=pre_actions, pre_actions=pre_actions,
post_actions=post_actions, post_actions=post_actions,
dry_run=self.dry_run, dry_run=self.dry_run,
disable_hooks=disable_hooks, disable_hooks=disable_hooks,
values=yaml.safe_dump(values), values=yaml.safe_dump(values),
wait=tiller_should_wait, wait=this_chart_should_wait,
timeout=wait_timeout) timeout=timer)
if tiller_should_wait: if this_chart_should_wait:
self.tiller.k8s.wait_until_ready( self.tiller.k8s.wait_until_ready(
release=prefix_chart, release=release_name,
labels=wait_values_labels, labels=wait_labels,
namespace=namespace, namespace=namespace,
k8s_wait_attempts=self.k8s_wait_attempts, k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep, k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=wait_timeout timeout=timer
) )
msg['upgrade'].append(prefix_chart) LOG.info('Upgrade completed with results from Tiller: %s',
tiller_result.__dict__)
msg['upgrade'].append(release_name)
# process install # process install
else: else:
LOG.info("Installing release %s", release) LOG.info("Installing release %s in namespace %s",
LOG.info('Beginning Install, wait: %s, %s', release_name, namespace)
tiller_should_wait, wait_timeout)
self.tiller.install_release( timer = int(round(deadline - time.time()))
LOG.info('Beginning Install, wait=%s, timeout=%ss',
this_chart_should_wait, timer)
tiller_result = self.tiller.install_release(
protoc_chart, protoc_chart,
prefix_chart, release_name,
namespace, namespace,
dry_run=self.dry_run, dry_run=self.dry_run,
values=yaml.safe_dump(values), values=yaml.safe_dump(values),
wait=tiller_should_wait, wait=this_chart_should_wait,
timeout=wait_timeout) timeout=timer)
if tiller_should_wait: if this_chart_should_wait:
self.tiller.k8s.wait_until_ready( self.tiller.k8s.wait_until_ready(
release=prefix_chart, release=release_name,
labels=wait_values_labels, labels=wait_labels,
namespace=namespace, namespace=namespace,
k8s_wait_attempts=self.k8s_wait_attempts, k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep, k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=wait_timeout timeout=timer
) )
msg['install'].append(prefix_chart) LOG.info('Install completed with results from Tiller: %s',
tiller_result.__dict__)
msg['install'].append(release_name)
LOG.debug("Cleaning up chart source in %s", # Sequenced ChartGroup should run tests after each Chart
chartbuilder.source_directory) timer = int(round(deadline - time.time()))
if test_this_chart and cg_sequenced:
LOG.info('Running sequenced test, timeout remaining: %ss.',
timer)
if timer <= 0:
reason = ('Timeout expired before testing sequenced '
'release %s' % release_name)
LOG.error(reason)
raise ArmadaTimeoutException(reason)
self._test_chart(release_name, timer)
if test_charts or (test_chart is True): # Un-sequenced ChartGroup should run tests at the end
LOG.info('Testing: %s', prefix_chart) elif test_this_chart:
resp = self.tiller.testing_release(prefix_chart) # Keeping track of time remaining
test_status = getattr(resp.info.status, tests_to_run.append((release_name, timer))
'last_test_suite_run', 'FAILED')
LOG.info("Test INFO: %s", test_status)
if resp:
LOG.info("PASSED: %s", prefix_chart)
else:
LOG.info("FAILED: %s", prefix_chart)
# TODO(MarshM) does this need release/labels/namespace? # End of Charts in ChartGroup
# TODO(MarshM) consider the tiller_timeout according to above logic LOG.info('All Charts applied.')
LOG.info('Wait after Chartgroup (%s) %ssec', desc, tiller_timeout)
self.tiller.k8s.wait_until_ready( # After all Charts are applied, we should wait for the entire
k8s_wait_attempts=self.k8s_wait_attempts, # ChartGroup to become healthy by looking at the namespaces seen
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep, # TODO(MarshM): Need to restrict to only charts we processed
timeout=tiller_timeout) # TODO(MarshM): Need to determine a better timeout
deadline = time.time() + cg_max_timeout
for ns in namespaces_seen:
timer = int(round(deadline - time.time()))
LOG.info('Final wait for healthy namespace (%s), '
'timeout remaining: %ss.', ns, timer)
if timer <= 0:
reason = 'Timeout expired waiting on namespace: %s' % (ns)
LOG.error(reason)
raise ArmadaTimeoutException(reason)
self.tiller.k8s.wait_until_ready(
namespace=ns,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=timer)
# After entire ChartGroup is healthy, run any pending tests
for (test, test_timer) in tests_to_run:
self._test_chart(test, test_timer)
LOG.info("Performing Post-Flight Operations") LOG.info("Performing Post-Flight Operations")
self.post_flight_ops() self.post_flight_ops()
@ -423,7 +465,7 @@ class Armada(object):
if self.enable_chart_cleanup: if self.enable_chart_cleanup:
self.tiller.chart_cleanup( self.tiller.chart_cleanup(
prefix, prefix,
self.manifest[const.KEYWORD_ARMADA][const.KEYWORD_GROUPS]) self.manifest[KEYWORD_ARMADA][KEYWORD_GROUPS])
return msg return msg
@ -432,53 +474,65 @@ class Armada(object):
Operations to run after deployment process has terminated Operations to run after deployment process has terminated
''' '''
# Delete temp dirs used for deployment # Delete temp dirs used for deployment
for group in self.manifest.get(const.KEYWORD_ARMADA, {}).get( for group in self.manifest.get(KEYWORD_ARMADA, {}).get(
const.KEYWORD_GROUPS, []): KEYWORD_GROUPS, []):
for ch in group.get(const.KEYWORD_CHARTS, []): for ch in group.get(KEYWORD_CHARTS, []):
chart = ch.get('chart', {}) chart = ch.get('chart', {})
if chart.get('source', {}).get('type') == 'git': if chart.get('source', {}).get('type') == 'git':
source_dir = chart.get('source_dir') source_dir = chart.get('source_dir')
if isinstance(source_dir, tuple) and source_dir: if isinstance(source_dir, tuple) and source_dir:
source.source_cleanup(source_dir[0]) source.source_cleanup(source_dir[0])
def _test_chart(self, release_name, timeout):
# TODO(MarshM): Fix testing, it's broken, and track timeout
resp = self.tiller.testing_release(release_name, timeout=timeout)
status = getattr(resp.info.status, 'last_test_suite_run', 'FAILED')
LOG.info("Test INFO: %s", status)
if resp:
LOG.info("PASSED: %s", release_name)
return True
else:
LOG.info("FAILED: %s", release_name)
return False
def show_diff(self, chart, installed_chart, installed_values, target_chart, def show_diff(self, chart, installed_chart, installed_values, target_chart,
target_values, msg): target_values, msg):
''' '''Produce a unified diff of the installed chart vs our intention'''
Produce a unified diff of the installed chart vs our intention
TODO(alanmeadows): This needs to be rewritten to produce better # TODO(MarshM) This gives decent output comparing values. Would be
unified diff output # nice to clean it up further. Are \\n or \n\n ever valid diffs?
''' # Can these be cleanly converted to dicts, for easier compare?
def _sanitize_diff_str(str):
return str.replace('\\n', '\n').replace('\n\n', '\n').split('\n')
source = str(installed_chart.SerializeToString()).split('\n') source = _sanitize_diff_str(str(installed_chart.SerializeToString()))
chart_diff = list( target = _sanitize_diff_str(str(target_chart))
difflib.unified_diff(source, str(target_chart).split('\n'))) chart_diff = list(difflib.unified_diff(source, target, n=0))
chart_release = chart.get('release', None) chart_release = chart.get('release', None)
if len(chart_diff) > 0: if len(chart_diff) > 0:
LOG.info("Found diff in Chart (%s)", chart_release)
diff_msg = [] diff_msg = []
for line in chart_diff: for line in chart_diff:
diff_msg.append(line) diff_msg.append(line)
msg['diff'].append({'chart': diff_msg}) msg['diff'].append({'chart': diff_msg})
pretty_diff = '\n'.join(diff_msg).replace(
'\\n', '\n').replace('\n\n', '\n') pretty_diff = '\n'.join(diff_msg)
LOG.info("Found diff in chart (%s)", chart_release)
LOG.debug(pretty_diff) LOG.debug(pretty_diff)
values_diff = list( source = _sanitize_diff_str(installed_values)
difflib.unified_diff( target = _sanitize_diff_str(yaml.safe_dump(target_values))
installed_values.split('\n'), values_diff = list(difflib.unified_diff(source, target, n=0))
yaml.safe_dump(target_values).split('\n')))
if len(values_diff) > 0: if len(values_diff) > 0:
LOG.info("Found diff in values (%s)", chart_release)
diff_msg = [] diff_msg = []
for line in values_diff: for line in values_diff:
diff_msg.append(line) diff_msg.append(line)
msg['diff'].append({'values': diff_msg}) msg['diff'].append({'values': diff_msg})
pretty_diff = '\n'.join(diff_msg).replace(
'\\n', '\n').replace('\n\n', '\n') pretty_diff = '\n'.join(diff_msg)
LOG.info("Found diff in chart values (%s)", chart_release)
LOG.debug(pretty_diff) LOG.debug(pretty_diff)
result = (len(chart_diff) > 0) or (len(values_diff) > 0) result = (len(chart_diff) > 0) or (len(values_diff) > 0)

View File

@ -22,6 +22,7 @@ from kubernetes.client.rest import ApiException
from oslo_config import cfg from oslo_config import cfg
from oslo_log import log as logging from oslo_log import log as logging
from armada.const import DEFAULT_K8S_TIMEOUT
from armada.utils.release import label_selectors from armada.utils.release import label_selectors
from armada.exceptions import k8s_exceptions as exceptions from armada.exceptions import k8s_exceptions as exceptions
@ -48,7 +49,8 @@ class K8s(object):
self.extension_api = client.ExtensionsV1beta1Api() self.extension_api = client.ExtensionsV1beta1Api()
def delete_job_action(self, name, namespace="default", def delete_job_action(self, name, namespace="default",
propagation_policy='Foreground'): propagation_policy='Foreground',
timeout=DEFAULT_K8S_TIMEOUT):
''' '''
:params name - name of the job :params name - name of the job
:params namespace - name of pod that job :params namespace - name of pod that job
@ -58,13 +60,36 @@ class K8s(object):
before the Job is marked as deleted. before the Job is marked as deleted.
''' '''
try: try:
LOG.debug('Deleting job %s, Wait timeout=%s', name, timeout)
body = client.V1DeleteOptions() body = client.V1DeleteOptions()
self.batch_api.delete_namespaced_job( w = watch.Watch()
name=name, namespace=namespace, body=body, issue_delete = True
propagation_policy=propagation_policy) for event in w.stream(self.batch_api.list_namespaced_job,
namespace=namespace,
timeout_seconds=timeout):
if issue_delete:
self.batch_api.delete_namespaced_job(
name=name, namespace=namespace, body=body,
propagation_policy=propagation_policy)
issue_delete = False
event_type = event['type'].upper()
job_name = event['object'].metadata.name
if event_type == 'DELETED' and job_name == name:
LOG.debug('Successfully deleted job %s', job_name)
return
err_msg = ('Reached timeout while waiting to delete job: '
'name=%s, namespace=%s' % (name, namespace))
LOG.error(err_msg)
raise exceptions.KubernetesWatchTimeoutException(err_msg)
except ApiException as e: except ApiException as e:
LOG.error("Exception when deleting job: name=%s, namespace=%s: %s", LOG.exception(
name, namespace, e) "Exception when deleting job: name=%s, namespace=%s",
name, namespace)
raise e
def get_namespace_job(self, namespace="default", label_selector=''): def get_namespace_job(self, namespace="default", label_selector=''):
''' '''
@ -188,7 +213,8 @@ class K8s(object):
LOG.info('New pod %s deployed', new_pod_name) LOG.info('New pod %s deployed', new_pod_name)
w.stop() w.stop()
def wait_get_completed_podphase(self, release, timeout=300): def wait_get_completed_podphase(self, release,
timeout=DEFAULT_K8S_TIMEOUT):
''' '''
:param release - part of namespace :param release - part of namespace
:param timeout - time before disconnecting stream :param timeout - time before disconnecting stream
@ -207,9 +233,9 @@ class K8s(object):
def wait_until_ready(self, def wait_until_ready(self,
release=None, release=None,
namespace='default', namespace='',
labels='', labels='',
timeout=300, timeout=DEFAULT_K8S_TIMEOUT,
k8s_wait_attempts=1, k8s_wait_attempts=1,
k8s_wait_attempt_sleep=1): k8s_wait_attempt_sleep=1):
''' '''
@ -232,10 +258,19 @@ class K8s(object):
sleep_time = (k8s_wait_attempt_sleep if k8s_wait_attempt_sleep >= 1 sleep_time = (k8s_wait_attempt_sleep if k8s_wait_attempt_sleep >= 1
else 1) else 1)
LOG.debug("Wait on %s (%s) for %s sec (k8s wait %s times, sleep %ss)", LOG.debug("Wait on namespace=(%s) labels=(%s) for %s sec "
"(k8s wait %s times, sleep %ss)",
namespace, label_selector, timeout, namespace, label_selector, timeout,
wait_attempts, sleep_time) wait_attempts, sleep_time)
if not namespace:
# This shouldn't be reachable
LOG.warn('"namespace" not specified, waiting across all available '
'namespaces is likely to cause unintended consequences.')
if not label_selector:
LOG.warn('"label_selector" not specified, waiting with no labels '
'may cause unintended consequences.')
deadline = time.time() + timeout deadline = time.time() + timeout
# NOTE(mark-burnett): Attempt to wait multiple times without # NOTE(mark-burnett): Attempt to wait multiple times without
@ -243,7 +278,7 @@ class K8s(object):
successes = 0 successes = 0
while successes < wait_attempts: while successes < wait_attempts:
deadline_remaining = int(deadline - time.time()) deadline_remaining = int(round(deadline - time.time()))
if deadline_remaining <= 0: if deadline_remaining <= 0:
return False return False
timed_out, modified_pods, unready_pods = self._wait_one_time( timed_out, modified_pods, unready_pods = self._wait_one_time(
@ -253,6 +288,9 @@ class K8s(object):
if timed_out: if timed_out:
LOG.info('Timed out waiting for pods: %s', LOG.info('Timed out waiting for pods: %s',
sorted(unready_pods)) sorted(unready_pods))
raise exceptions.KubernetesWatchTimeoutException(
'Timed out while waiting on namespace=(%s) labels=(%s)' %
(namespace, label_selector))
return False return False
if modified_pods: if modified_pods:
@ -276,17 +314,23 @@ class K8s(object):
w = watch.Watch() w = watch.Watch()
first_event = True first_event = True
# TODO(MarshM) still need to filter pods on namespace # Watch across specific namespace, or all
for event in w.stream(self.client.list_pod_for_all_namespaces, kwargs = {
label_selector=label_selector, 'label_selector': label_selector,
timeout_seconds=timeout): 'timeout_seconds': timeout,
}
if namespace:
func_to_call = self.client.list_namespaced_pod
kwargs['namespace'] = namespace
else:
func_to_call = self.client.list_pod_for_all_namespaces
for event in w.stream(func_to_call, **kwargs):
if first_event: if first_event:
pod_list = self.client.list_pod_for_all_namespaces( pod_list = func_to_call(**kwargs)
label_selector=label_selector,
timeout_seconds=timeout)
for pod in pod_list.items: for pod in pod_list.items:
LOG.debug('Setting up to wait for pod %s', LOG.debug('Setting up to wait for pod %s namespace=%s',
pod.metadata.name) pod.metadata.name, pod.metadata.namespace)
ready_pods[pod.metadata.name] = False ready_pods[pod.metadata.name] = False
first_event = False first_event = False

View File

@ -51,6 +51,16 @@ CONF = cfg.CONF
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
class TillerResult(object):
'''Object to hold Tiller results for Armada.'''
def __init__(self, release, namespace, status, description, version):
self.release = release
self.namespace = namespace
self.status = status
self.description = description
self.version = version
class Tiller(object): class Tiller(object):
''' '''
The Tiller class supports communication and requests to the Tiller Helm The Tiller class supports communication and requests to the Tiller Helm
@ -164,10 +174,14 @@ class Tiller(object):
''' '''
List Helm Releases List Helm Releases
''' '''
# TODO(MarshM possibly combine list_releases() with list_charts()
# since they do the same thing, grouping output differently
releases = [] releases = []
stub = ReleaseServiceStub(self.channel) stub = ReleaseServiceStub(self.channel)
# TODO(mark-burnett): Since we're limiting page size, we need to # TODO(mark-burnett): Since we're limiting page size, we need to
# iterate through all the pages when collecting this list. # iterate through all the pages when collecting this list.
# NOTE(MarshM): `Helm List` defaults to returning Deployed and Failed,
# but this might not be a desireable ListReleasesRequest default.
req = ListReleasesRequest(limit=RELEASE_LIMIT, req = ListReleasesRequest(limit=RELEASE_LIMIT,
status_codes=[STATUS_DEPLOYED, status_codes=[STATUS_DEPLOYED,
STATUS_FAILED], STATUS_FAILED],
@ -179,6 +193,7 @@ class Tiller(object):
metadata=self.metadata) metadata=self.metadata)
for y in release_list: for y in release_list:
# TODO(MarshM) this log is too noisy, fix later
# LOG.debug('Found release: %s', y.releases # LOG.debug('Found release: %s', y.releases
releases.extend(y.releases) releases.extend(y.releases)
@ -210,7 +225,7 @@ class Tiller(object):
return template return template
def _pre_update_actions(self, actions, release_name, namespace, chart, def _pre_update_actions(self, actions, release_name, namespace, chart,
disable_hooks, values): disable_hooks, values, timeout):
''' '''
:params actions - array of items actions :params actions - array of items actions
:params namespace - name of pod for actions :params namespace - name of pod for actions
@ -225,9 +240,10 @@ class Tiller(object):
self.rolling_upgrade_pod_deployment( self.rolling_upgrade_pod_deployment(
name, release_name, namespace, labels, name, release_name, namespace, labels,
action_type, chart, disable_hooks, values) action_type, chart, disable_hooks, values, timeout)
except Exception: except Exception:
LOG.warn("Pre: Could not update anything, please check yaml") LOG.warn("Pre: Could not update anything, please check yaml")
raise ex.PreUpdateJobDeleteException(name, namespace)
try: try:
for action in actions.get('delete', []): for action in actions.get('delete', []):
@ -235,8 +251,8 @@ class Tiller(object):
action_type = action.get('type') action_type = action.get('type')
labels = action.get('labels', None) labels = action.get('labels', None)
self.delete_resources( self.delete_resources(release_name, name, action_type,
release_name, name, action_type, labels, namespace) labels, namespace, timeout)
except Exception: except Exception:
LOG.warn("PRE: Could not delete anything, please check yaml") LOG.warn("PRE: Could not delete anything, please check yaml")
raise ex.PreUpdateJobDeleteException(name, namespace) raise ex.PreUpdateJobDeleteException(name, namespace)
@ -275,17 +291,22 @@ class Tiller(object):
Returns a list of tuples in the form: Returns a list of tuples in the form:
(name, version, chart, values, status) (name, version, chart, values, status)
''' '''
LOG.debug('Getting known releases from Tiller...')
charts = [] charts = []
for latest_release in self.list_releases(): for latest_release in self.list_releases():
try: try:
charts.append( release = (
(latest_release.name, latest_release.version, latest_release.name, latest_release.version,
latest_release.chart, latest_release.config.raw, latest_release.chart, latest_release.config.raw,
latest_release.info.status.Code.Name( latest_release.info.status.Code.Name(
latest_release.info.status.code))) latest_release.info.status.code))
except IndexError: charts.append(release)
LOG.debug('Found release %s, version %s, status: %s',
release[0], release[1], release[4])
except (AttributeError, IndexError) as e:
LOG.debug('%s while getting releases: %s, ex=%s',
e.__class__.__name__, latest_release, e)
continue continue
# LOG.debug('List of Helm Charts from Latest Releases: %s', charts)
return charts return charts
def update_release(self, chart, release, namespace, def update_release(self, chart, release, namespace,
@ -312,7 +333,7 @@ class Tiller(object):
values = Config(raw=values) values = Config(raw=values)
self._pre_update_actions(pre_actions, release, namespace, chart, self._pre_update_actions(pre_actions, release, namespace, chart,
disable_hooks, values) disable_hooks, values, timeout)
# build release install request # build release install request
try: try:
@ -329,7 +350,16 @@ class Tiller(object):
update_msg = stub.UpdateRelease( update_msg = stub.UpdateRelease(
release_request, rel_timeout + GRPC_EPSILON, release_request, rel_timeout + GRPC_EPSILON,
metadata=self.metadata) metadata=self.metadata)
return update_msg
tiller_result = TillerResult(
update_msg.release.name,
update_msg.release.namespace,
update_msg.release.info.status.Code.Name(
update_msg.release.info.status.code),
update_msg.release.info.Description,
update_msg.release.version)
return tiller_result
except Exception: except Exception:
LOG.exception('Error while updating release %s', release) LOG.exception('Error while updating release %s', release)
status = self.get_release_status(release) status = self.get_release_status(release)
@ -372,7 +402,16 @@ class Tiller(object):
install_msg = stub.InstallRelease( install_msg = stub.InstallRelease(
release_request, rel_timeout + GRPC_EPSILON, release_request, rel_timeout + GRPC_EPSILON,
metadata=self.metadata) metadata=self.metadata)
return install_msg
tiller_result = TillerResult(
install_msg.release.name,
install_msg.release.namespace,
install_msg.release.info.status.Code.Name(
install_msg.release.info.status.code),
install_msg.release.info.Description,
install_msg.release.version)
return tiller_result
except Exception: except Exception:
LOG.exception('Error while installing release %s', release) LOG.exception('Error while installing release %s', release)
status = self.get_release_status(release) status = self.get_release_status(release)
@ -527,7 +566,8 @@ class Tiller(object):
self.uninstall_release(chart) self.uninstall_release(chart)
def delete_resources(self, release_name, resource_name, resource_type, def delete_resources(self, release_name, resource_name, resource_type,
resource_labels, namespace, wait=False): resource_labels, namespace, wait=False,
timeout=TILLER_TIMEOUT):
''' '''
:params release_name - release name the specified resource is under :params release_name - release name the specified resource is under
:params resource_name - name of specific resource :params resource_name - name of specific resource
@ -541,16 +581,16 @@ class Tiller(object):
label_selector = '' label_selector = ''
if resource_labels is not None: if resource_labels is not None:
label_selector = label_selectors(resource_labels) label_selector = label_selectors(resource_labels)
LOG.debug("Deleting resources in namespace %s matching" LOG.debug("Deleting resources in namespace %s matching "
"selectors %s.", namespace, label_selector) "selectors %s.", namespace, label_selector)
if 'job' in resource_type: if 'job' in resource_type:
get_jobs = self.k8s.get_namespace_job(namespace, label_selector) get_jobs = self.k8s.get_namespace_job(namespace, label_selector)
for jb in get_jobs.items: for jb in get_jobs.items:
jb_name = jb.metadata.name jb_name = jb.metadata.name
LOG.info("Deleting %s in namespace: %s", jb_name, namespace) LOG.info("Deleting job %s in namespace: %s",
jb_name, namespace)
self.k8s.delete_job_action(jb_name, namespace) self.k8s.delete_job_action(jb_name, namespace, timeout=timeout)
elif 'pod' in resource_type: elif 'pod' in resource_type:
release_pods = self.k8s.get_namespace_pod( release_pods = self.k8s.get_namespace_pod(
@ -558,7 +598,8 @@ class Tiller(object):
for pod in release_pods.items: for pod in release_pods.items:
pod_name = pod.metadata.name pod_name = pod.metadata.name
LOG.info("Deleting %s in namespace: %s", pod_name, namespace) LOG.info("Deleting pod %s in namespace: %s",
pod_name, namespace)
self.k8s.delete_namespace_pod(pod_name, namespace) self.k8s.delete_namespace_pod(pod_name, namespace)
if wait: if wait:
self.k8s.wait_for_pod_redeployment(pod_name, namespace) self.k8s.wait_for_pod_redeployment(pod_name, namespace)
@ -568,7 +609,8 @@ class Tiller(object):
def rolling_upgrade_pod_deployment(self, name, release_name, namespace, def rolling_upgrade_pod_deployment(self, name, release_name, namespace,
resource_labels, action_type, chart, resource_labels, action_type, chart,
disable_hooks, values): disable_hooks, values,
timeout=TILLER_TIMEOUT):
''' '''
update statefullsets (daemon, stateful) update statefullsets (daemon, stateful)
''' '''
@ -607,7 +649,7 @@ class Tiller(object):
# delete pods # delete pods
self.delete_resources( self.delete_resources(
release_name, name, 'pod', resource_labels, namespace, release_name, name, 'pod', resource_labels, namespace,
wait=True) wait=True, timeout=timeout)
else: else:
LOG.error("Unable to exectue name: % type: %s", name, action_type) LOG.error("Unable to exectue name: % type: %s", name, action_type)

View File

@ -48,8 +48,8 @@ class ArmadaControllerTest(base.BaseControllerTest):
'disable_update_post': False, 'disable_update_post': False,
'enable_chart_cleanup': False, 'enable_chart_cleanup': False,
'dry_run': False, 'dry_run': False,
'tiller_should_wait': False, 'force_wait': False,
'tiller_timeout': 100, 'timeout': 100,
'tiller_host': None, 'tiller_host': None,
'tiller_port': 44134, 'tiller_port': 44134,
'tiller_namespace': 'kube-system', 'tiller_namespace': 'kube-system',

View File

@ -55,7 +55,8 @@ data:
location: /tmp/dummy/armada location: /tmp/dummy/armada
subpath: chart_2 subpath: chart_2
dependencies: [] dependencies: []
timeout: 5 wait:
timeout: 10
--- ---
schema: armada/Chart/v1 schema: armada/Chart/v1
metadata: metadata:
@ -72,7 +73,8 @@ data:
subpath: chart_1 subpath: chart_1
reference: master reference: master
dependencies: [] dependencies: []
timeout: 50 wait:
timeout: 10
""" """
CHART_SOURCES = [('git://github.com/dummy/armada', 'chart_1'), CHART_SOURCES = [('git://github.com/dummy/armada', 'chart_1'),
@ -104,8 +106,10 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
'type': 'git' 'type': 'git'
}, },
'source_dir': CHART_SOURCES[0], 'source_dir': CHART_SOURCES[0],
'timeout': 50, 'values': {},
'values': {} 'wait': {
'timeout': 10
}
} }
}, },
{ {
@ -120,8 +124,10 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
'type': 'local' 'type': 'local'
}, },
'source_dir': CHART_SOURCES[1], 'source_dir': CHART_SOURCES[1],
'timeout': 5, 'values': {},
'values': {} 'wait': {
'timeout': 10
}
} }
} }
], ],
@ -212,8 +218,8 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
chart_1['namespace'], chart_1['namespace'],
dry_run=armada_obj.dry_run, dry_run=armada_obj.dry_run,
values=yaml.safe_dump(chart_1['values']), values=yaml.safe_dump(chart_1['values']),
wait=armada_obj.tiller_should_wait, timeout=10,
timeout=armada_obj.tiller_timeout), wait=True),
mock.call( mock.call(
mock_chartbuilder().get_helm_chart(), mock_chartbuilder().get_helm_chart(),
"{}-{}".format(armada_obj.manifest['armada']['release_prefix'], "{}-{}".format(armada_obj.manifest['armada']['release_prefix'],
@ -221,7 +227,7 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
chart_2['namespace'], chart_2['namespace'],
dry_run=armada_obj.dry_run, dry_run=armada_obj.dry_run,
values=yaml.safe_dump(chart_2['values']), values=yaml.safe_dump(chart_2['values']),
wait=armada_obj.tiller_should_wait, timeout=10,
timeout=armada_obj.tiller_timeout) wait=True)
] ]
mock_tiller.return_value.install_release.assert_has_calls(method_calls) mock_tiller.return_value.install_release.assert_has_calls(method_calls)

View File

@ -18,6 +18,14 @@
:widths: 5 50 :widths: 5 50
:header-rows: 1 :header-rows: 1
* - Exception Name
- Description
* - ArmadaTimeoutException
- .. autoexception:: armada.exceptions.armada_exceptions.ArmadaTimeoutException
:members:
:show-inheritance:
:undoc-members:
* - Exception Name * - Exception Name
- Description - Description
* - KnownReleasesException * - KnownReleasesException

View File

@ -31,3 +31,8 @@
:members: :members:
:show-inheritance: :show-inheritance:
:undoc-members: :undoc-members:
* - KubernetesWatchTimeoutException
- .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesWatchTimeoutException
:members:
:show-inheritance:
:undoc-members: