diff --git a/armada/api/controller/armada.py b/armada/api/controller/armada.py
index e2dac5b0..c43f5e6f 100644
--- a/armada/api/controller/armada.py
+++ b/armada/api/controller/armada.py
@@ -80,8 +80,8 @@ class Apply(api.BaseResource):
                 enable_chart_cleanup=req.get_param_as_bool(
                     'enable_chart_cleanup'),
                 dry_run=req.get_param_as_bool('dry_run'),
-                tiller_should_wait=req.get_param_as_bool('wait'),
-                tiller_timeout=req.get_param_as_int('timeout') or 3600,
+                force_wait=req.get_param_as_bool('wait'),
+                timeout=req.get_param_as_int('timeout') or 0,
                 tiller_host=req.get_param('tiller_host'),
                 tiller_port=req.get_param_as_int(
                     'tiller_port') or CONF.tiller_port,
diff --git a/armada/cli/apply.py b/armada/cli/apply.py
index 85fef7b6..fef068ba 100644
--- a/armada/cli/apply.py
+++ b/armada/cli/apply.py
@@ -106,9 +106,10 @@ SHORT_DESC = "Command installs manifest charts."
     type=str,
     default=CONF.tiller_namespace)
 @click.option('--timeout',
-              help="Specifies time to wait for charts to deploy.",
+              help="Specifies time to wait for each chart to fully "
+                   "finish deploying.",
               type=int,
-              default=3600)
+              default=0)
 @click.option('--values', '-f',
               help=("Use to override multiple Armada Manifest values by "
                     "reading overrides from a values.yaml-type file."),
@@ -116,7 +117,9 @@ SHORT_DESC = "Command installs manifest charts."
               type=str,
               default=[])
 @click.option('--wait',
-              help="Wait until all charts deployed.",
+              help=("Force Tiller to wait until all charts are deployed, "
+                    "rather than using each chart's specified wait policy. "
+                    "This is equivalent to sequenced chartgroups."),
               is_flag=True)
 @click.option('--target-manifest',
               help=("The target manifest to run. Required for specifying "
@@ -207,8 +210,8 @@ class ApplyManifest(CliAction):
             enable_chart_cleanup=self.enable_chart_cleanup,
             dry_run=self.dry_run,
             set_ovr=self.set,
-            tiller_should_wait=self.wait,
-            tiller_timeout=self.timeout,
+            force_wait=self.wait,
+            timeout=self.timeout,
             tiller_host=self.tiller_host,
             tiller_port=self.tiller_port,
             tiller_namespace=self.tiller_namespace,
diff --git a/armada/const.py b/armada/const.py
index 907b7512..63e8ca29 100644
--- a/armada/const.py
+++ b/armada/const.py
@@ -23,11 +23,14 @@ KEYWORD_PREFIX = 'release_prefix'
 KEYWORD_GROUPS = 'chart_groups'
 KEYWORD_CHARTS = 'chart_group'
 KEYWORD_RELEASE = 'release'
-KEYWORD_CHART = 'chart'

-# Statuses
+# Tiller
+DEFAULT_CHART_TIMEOUT = 3600
 STATUS_DEPLOYED = 'DEPLOYED'
 STATUS_FAILED = 'FAILED'

+# Kubernetes
+DEFAULT_K8S_TIMEOUT = 300
+
 # Configuration File
 CONFIG_PATH = '/etc/armada'
diff --git a/armada/exceptions/armada_exceptions.py b/armada/exceptions/armada_exceptions.py
index 6dc719a2..544bb162 100644
--- a/armada/exceptions/armada_exceptions.py
+++ b/armada/exceptions/armada_exceptions.py
@@ -21,6 +21,14 @@ class ArmadaException(base_exception.ArmadaBaseException):
     message = 'An unknown Armada handler error occurred.'


+class ArmadaTimeoutException(ArmadaException):
+    '''Exception that occurs when Armada times out while processing.'''
+
+    def __init__(self, reason):
+        self._message = 'Armada timed out waiting on: %s' % (reason)
+        super(ArmadaTimeoutException, self).__init__(self._message)
+
+
 class KnownReleasesException(ArmadaException):
     '''
     Exception that occurs when no known releases are found.
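Note (illustrative, not part of the patch): with these changes the CLI/API `--timeout` defaults to 0 and the effective wait timeout is resolved per chart by the handler changes further below. A minimal sketch of that precedence, using a hypothetical helper name:

    DEFAULT_CHART_TIMEOUT = 3600  # added to armada/const.py by this patch

    def resolve_wait_timeout(cli_timeout, chart):
        # CLI/API --timeout wins when > 0; otherwise fall back to the chart's
        # wait.timeout, then the deprecated chart-level timeout, and finally
        # to DEFAULT_CHART_TIMEOUT when the chart must be waited on.
        wait_timeout = cli_timeout
        if wait_timeout <= 0:
            chart_timeout = chart.get('timeout', 0)
            wait_timeout = chart.get('wait', {}).get('timeout', chart_timeout)
        return wait_timeout if wait_timeout > 0 else DEFAULT_CHART_TIMEOUT

    # Example: resolve_wait_timeout(0, {'wait': {'timeout': 900}}) == 900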
diff --git a/armada/exceptions/k8s_exceptions.py b/armada/exceptions/k8s_exceptions.py
index 82ffc6d9..35d8808e 100644
--- a/armada/exceptions/k8s_exceptions.py
+++ b/armada/exceptions/k8s_exceptions.py
@@ -18,7 +18,13 @@ from armada.exceptions.base_exception import ArmadaBaseException as ex
 class KubernetesException(ex):
     '''Base class for Kubernetes exceptions and error handling.'''

-    message = 'An unknown Kubernetes error occured.'
+    message = 'An unknown Kubernetes error occurred.'
+
+
+class KubernetesWatchTimeoutException(KubernetesException):
+    '''Exception for timing out during a watch on a Kubernetes object'''
+
+    message = 'Kubernetes Watch has timed out.'


 class KubernetesUnknownStreamingEventTypeException(KubernetesException):
diff --git a/armada/handlers/armada.py b/armada/handlers/armada.py
index d744dd71..696c9ded 100644
--- a/armada/handlers/armada.py
+++ b/armada/handlers/armada.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import difflib
+import time
 import yaml

 from oslo_config import cfg
@@ -22,18 +23,23 @@ from armada.handlers.chartbuilder import ChartBuilder
 from armada.handlers.manifest import Manifest
 from armada.handlers.override import Override
 from armada.handlers.tiller import Tiller
-from armada.exceptions import armada_exceptions
+from armada.exceptions.armada_exceptions import ArmadaTimeoutException
 from armada.exceptions import source_exceptions
 from armada.exceptions import validate_exceptions
 from armada.exceptions import tiller_exceptions
 from armada.utils.release import release_prefix
 from armada.utils import source
 from armada.utils import validate
-from armada import const
+
+from armada.const import DEFAULT_CHART_TIMEOUT
+from armada.const import KEYWORD_ARMADA
+from armada.const import KEYWORD_CHARTS
+from armada.const import KEYWORD_GROUPS
+from armada.const import KEYWORD_PREFIX
+from armada.const import STATUS_FAILED

 LOG = logging.getLogger(__name__)
 CONF = cfg.CONF
-DEFAULT_TIMEOUT = 3600


 class Armada(object):
@@ -49,8 +55,8 @@ class Armada(object):
                  enable_chart_cleanup=False,
                  dry_run=False,
                  set_ovr=None,
-                 tiller_should_wait=False,
-                 tiller_timeout=DEFAULT_TIMEOUT,
+                 force_wait=False,
+                 timeout=0,
                  tiller_host=None,
                  tiller_port=None,
                  tiller_namespace=None,
@@ -67,10 +73,10 @@ class Armada(object):
             operations.
         :param bool enable_chart_cleanup: Clean up unmanaged charts.
         :param bool dry_run: Run charts without installing them.
-        :param bool tiller_should_wait: Specifies whether Tiller should wait
-            until all charts are deployed.
-        :param int tiller_timeout: Specifies time Tiller should wait for charts
-            to deploy until timing out.
+        :param bool force_wait: Force Tiller to wait until all charts are
+            deployed, rather than using each chart's specified wait policy.
+        :param int timeout: Specifies overall time in seconds that Tiller
+            should wait for charts until timing out.
         :param str tiller_host: Tiller host IP. Default is None.
         :param int tiller_port: Tiller host port. Default is
             ``CONF.tiller_port``.
@@ -90,8 +96,8 @@ class Armada(object):
         self.disable_update_post = disable_update_post
         self.enable_chart_cleanup = enable_chart_cleanup
         self.dry_run = dry_run
-        self.tiller_should_wait = tiller_should_wait
-        self.tiller_timeout = tiller_timeout
+        self.force_wait = force_wait
+        self.timeout = timeout
         self.tiller = Tiller(
             tiller_host=tiller_host, tiller_port=tiller_port,
             tiller_namespace=tiller_namespace)
@@ -140,13 +146,13 @@ class Armada(object):
                 details=','.join([m.get('message') for m in msg_list]))

         # Purge known releases that have failed and are in the current yaml
-        armada_data = self.manifest.get(const.KEYWORD_ARMADA, {})
-        prefix = armada_data.get(const.KEYWORD_PREFIX, '')
-        failed_releases = self.get_releases_by_status(const.STATUS_FAILED)
+        manifest_data = self.manifest.get(KEYWORD_ARMADA, {})
+        prefix = manifest_data.get(KEYWORD_PREFIX, '')
+        failed_releases = self.get_releases_by_status(STATUS_FAILED)

         for release in failed_releases:
-            for group in armada_data.get(const.KEYWORD_GROUPS, []):
-                for ch in group.get(const.KEYWORD_CHARTS, []):
+            for group in manifest_data.get(KEYWORD_GROUPS, []):
+                for ch in group.get(KEYWORD_CHARTS, []):
                     ch_release_name = release_prefix(
                         prefix, ch.get('chart', {}).get('chart_name'))
                     if release[0] == ch_release_name:
@@ -159,8 +165,8 @@ class Armada(object):
         # We only support a git source type right now, which can also
         # handle git:// local paths as well
         repos = {}
-        for group in armada_data.get(const.KEYWORD_GROUPS, []):
-            for ch in group.get(const.KEYWORD_CHARTS, []):
+        for group in manifest_data.get(KEYWORD_GROUPS, []):
+            for ch in group.get(KEYWORD_CHARTS, []):
                 self.tag_cloned_repo(ch, repos)

                 for dep in ch.get('chart', {}).get('dependencies', []):
@@ -241,79 +247,82 @@ class Armada(object):

         # extract known charts on tiller right now
         known_releases = self.tiller.list_charts()
-        armada_data = self.manifest.get(const.KEYWORD_ARMADA, {})
-        prefix = armada_data.get(const.KEYWORD_PREFIX, '')
+        manifest_data = self.manifest.get(KEYWORD_ARMADA, {})
+        prefix = manifest_data.get(KEYWORD_PREFIX, '')

-        # TODO(fmontei): This is a useless exception that is probably never
-        # thrown as `known_releases` is a list and the proper behavior here
-        # should be to return early. Fix this once all side effects of
-        # correcting this are well understood.
-        if known_releases is None:
-            raise armada_exceptions.KnownReleasesException()
+        for chartgroup in manifest_data.get(KEYWORD_GROUPS, []):
+            cg_name = chartgroup.get('name', '')
+            cg_desc = chartgroup.get('description', '')
+            LOG.info('Processing ChartGroup: %s (%s)', cg_name, cg_desc)

-        for release in known_releases:
-            LOG.debug("Release %s, Version %s found on Tiller", release[0],
-                      release[1])
+            cg_sequenced = chartgroup.get('sequenced', False)
+            cg_test_all_charts = chartgroup.get('test_charts', False)

-        for group in armada_data.get(const.KEYWORD_GROUPS, []):
-            tiller_should_wait = self.tiller_should_wait
-            tiller_timeout = self.tiller_timeout
-            desc = group.get('description', 'A Chart Group')
-            charts = group.get(const.KEYWORD_CHARTS, [])
-            test_charts = group.get('test_charts', False)
+            namespaces_seen = set()
+            tests_to_run = []

-            if group.get('sequenced', False) or test_charts:
-                tiller_should_wait = True
+            cg_charts = chartgroup.get(KEYWORD_CHARTS, [])

-            LOG.info('Deploying: %s', desc)
+            # Track largest Chart timeout to stop the ChartGroup at the end
+            cg_max_timeout = 0

-            for chart in charts:
-                chart = chart.get('chart', {})
+            for chart_entry in cg_charts:
+                chart = chart_entry.get('chart', {})
+                namespace = chart.get('namespace')
+                namespaces_seen.add(namespace)
+                release = chart.get('release')
                 values = chart.get('values', {})
-                test_chart = chart.get('test', False)
-                namespace = chart.get('namespace', None)
-                release = chart.get('release', None)
                 pre_actions = {}
                 post_actions = {}

-                if release is None:
-                    continue
+                release_name = release_prefix(prefix, release)

-                if test_chart is True:
-                    tiller_should_wait = True
+                # Retrieve appropriate timeout value
+                wait_timeout = self.timeout
+                if wait_timeout <= 0:
+                    # TODO(MarshM): chart's `data.timeout` should be deprecated
+                    chart_timeout = chart.get('timeout', 0)
+                    # Favor data.wait.timeout over data.timeout, until removed
+                    wait_values = chart.get('wait', {})
+                    wait_timeout = wait_values.get('timeout', chart_timeout)
+                    wait_labels = wait_values.get('labels', {})

-                # retrieve appropriate timeout value
-                # TODO(MarshM): chart's `data.timeout` should be deprecated
-                #               to favor `data.wait.timeout`
-                # TODO(MarshM) also: timeout logic seems to prefer chart values
-                #               over api/cli, probably should swap?
-                #               (caution: it always default to 3600,
-                #               take care to differentiate user input)
-                if tiller_should_wait and tiller_timeout == DEFAULT_TIMEOUT:
-                    tiller_timeout = chart.get('timeout', tiller_timeout)
-                wait_values = chart.get('wait', {})
-                wait_timeout = wait_values.get('timeout', tiller_timeout)
-                wait_values_labels = wait_values.get('labels', {})
+                this_chart_should_wait = (
+                    cg_sequenced or self.force_wait or
+                    wait_timeout > 0 or len(wait_labels) > 0)
+
+                if this_chart_should_wait and wait_timeout <= 0:
+                    LOG.warn('No Chart timeout specified, using default: %ss',
+                             DEFAULT_CHART_TIMEOUT)
+                    wait_timeout = DEFAULT_CHART_TIMEOUT
+
+                # Naively take largest timeout to apply at end
+                # TODO(MarshM) better handling of timeout/timer
+                cg_max_timeout = max(wait_timeout, cg_max_timeout)
+
+                # Chart test policy can override ChartGroup, if specified
+                test_this_chart = chart.get('test', cg_test_all_charts)

                 chartbuilder = ChartBuilder(chart)
                 protoc_chart = chartbuilder.get_helm_chart()

-                # determine install or upgrade by examining known releases
-                LOG.debug("RELEASE: %s", release)
                 deployed_releases = [x[0] for x in known_releases]
-                prefix_chart = release_prefix(prefix, release)
+
+                # Begin Chart timeout deadline
+                deadline = time.time() + wait_timeout

                 # TODO(mark-burnett): It may be more robust to directly call
                 # tiller status to decide whether to install/upgrade rather
                 # than checking for list membership.
-                if prefix_chart in deployed_releases:
+                if release_name in deployed_releases:

                     # indicate to the end user what path we are taking
-                    LOG.info("Upgrading release %s", release)
+                    LOG.info("Upgrading release %s in namespace %s",
+                             release_name, namespace)
+
                     # extract the installed chart and installed values from the
                     # latest release so we can compare to the intended state
                     apply_chart, apply_values = self.find_release_chart(
-                        known_releases, prefix_chart)
+                        known_releases, release_name)

                     upgrade = chart.get('upgrade', {})
                     disable_hooks = upgrade.get('no_hooks', False)
@@ -329,7 +338,7 @@ class Armada(object):
                     if not self.disable_update_post and upgrade_post:
                         post_actions = upgrade_post

-                    # show delta for both the chart templates and the chart
+                    # Show delta for both the chart templates and the chart
                     # values
                     # TODO(alanmeadows) account for .files differences
                     # once we support those
@@ -342,80 +351,113 @@ class Armada(object):
                         LOG.info("There are no updates found in this chart")
                         continue

+                    # TODO(MarshM): Add tiller dry-run before upgrade and
+                    # consider deadline impacts
+
                     # do actual update
-                    LOG.info('Beginning Upgrade, wait: %s, %s',
-                             tiller_should_wait, wait_timeout)
-                    self.tiller.update_release(
+                    timer = int(round(deadline - time.time()))
+                    LOG.info('Beginning Upgrade, wait=%s, timeout=%ss',
+                             this_chart_should_wait, timer)
+                    tiller_result = self.tiller.update_release(
                         protoc_chart,
-                        prefix_chart,
+                        release_name,
                         namespace,
                         pre_actions=pre_actions,
                         post_actions=post_actions,
                         dry_run=self.dry_run,
                         disable_hooks=disable_hooks,
                         values=yaml.safe_dump(values),
-                        wait=tiller_should_wait,
-                        timeout=wait_timeout)
+                        wait=this_chart_should_wait,
+                        timeout=timer)

-                    if tiller_should_wait:
+                    if this_chart_should_wait:
                         self.tiller.k8s.wait_until_ready(
-                            release=prefix_chart,
-                            labels=wait_values_labels,
+                            release=release_name,
+                            labels=wait_labels,
                             namespace=namespace,
                             k8s_wait_attempts=self.k8s_wait_attempts,
                             k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
-                            timeout=wait_timeout
+                            timeout=timer
                         )

-                    msg['upgrade'].append(prefix_chart)
+                    LOG.info('Upgrade completed with results from Tiller: %s',
+                             tiller_result.__dict__)
+                    msg['upgrade'].append(release_name)

                 # process install
                 else:
-                    LOG.info("Installing release %s", release)
-                    LOG.info('Beginning Install, wait: %s, %s',
-                             tiller_should_wait, wait_timeout)
-                    self.tiller.install_release(
+                    LOG.info("Installing release %s in namespace %s",
+                             release_name, namespace)
+
+                    timer = int(round(deadline - time.time()))
+                    LOG.info('Beginning Install, wait=%s, timeout=%ss',
+                             this_chart_should_wait, timer)
+                    tiller_result = self.tiller.install_release(
                         protoc_chart,
-                        prefix_chart,
+                        release_name,
                         namespace,
                         dry_run=self.dry_run,
                         values=yaml.safe_dump(values),
-                        wait=tiller_should_wait,
-                        timeout=wait_timeout)
+                        wait=this_chart_should_wait,
+                        timeout=timer)

-                    if tiller_should_wait:
+                    if this_chart_should_wait:
                         self.tiller.k8s.wait_until_ready(
-                            release=prefix_chart,
-                            labels=wait_values_labels,
+                            release=release_name,
+                            labels=wait_labels,
                             namespace=namespace,
                             k8s_wait_attempts=self.k8s_wait_attempts,
                             k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
-                            timeout=wait_timeout
+                            timeout=timer
                         )

-                    msg['install'].append(prefix_chart)
+                    LOG.info('Install completed with results from Tiller: %s',
+                             tiller_result.__dict__)
+                    msg['install'].append(release_name)

-                LOG.debug("Cleaning up chart source in %s",
-                          chartbuilder.source_directory)
+                # Sequenced ChartGroup should run tests after each Chart
+                timer = int(round(deadline - time.time()))
+                if test_this_chart and cg_sequenced:
+                    LOG.info('Running sequenced test, timeout remaining: %ss.',
+                             timer)
+                    if timer <= 0:
+                        reason = ('Timeout expired before testing sequenced '
                                  'release %s' % release_name)
+                        LOG.error(reason)
+                        raise ArmadaTimeoutException(reason)
+                    self._test_chart(release_name, timer)

-                if test_charts or (test_chart is True):
-                    LOG.info('Testing: %s', prefix_chart)
-                    resp = self.tiller.testing_release(prefix_chart)
-                    test_status = getattr(resp.info.status,
-                                          'last_test_suite_run', 'FAILED')
-                    LOG.info("Test INFO: %s", test_status)
-                    if resp:
-                        LOG.info("PASSED: %s", prefix_chart)
-                    else:
-                        LOG.info("FAILED: %s", prefix_chart)
+                # Un-sequenced ChartGroup should run tests at the end
+                elif test_this_chart:
+                    # Keeping track of time remaining
+                    tests_to_run.append((release_name, timer))

-            # TODO(MarshM) does this need release/labels/namespace?
-            # TODO(MarshM) consider the tiller_timeout according to above logic
-            LOG.info('Wait after Chartgroup (%s) %ssec', desc, tiller_timeout)
-            self.tiller.k8s.wait_until_ready(
-                k8s_wait_attempts=self.k8s_wait_attempts,
-                k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
-                timeout=tiller_timeout)
+            # End of Charts in ChartGroup
+            LOG.info('All Charts applied.')
+
+            # After all Charts are applied, we should wait for the entire
+            # ChartGroup to become healthy by looking at the namespaces seen
+            # TODO(MarshM): Need to restrict to only charts we processed
+            # TODO(MarshM): Need to determine a better timeout
+            deadline = time.time() + cg_max_timeout
+            for ns in namespaces_seen:
+                timer = int(round(deadline - time.time()))
+                LOG.info('Final wait for healthy namespace (%s), '
+                         'timeout remaining: %ss.', ns, timer)
+                if timer <= 0:
+                    reason = 'Timeout expired waiting on namespace: %s' % (ns)
+                    LOG.error(reason)
+                    raise ArmadaTimeoutException(reason)
+
+                self.tiller.k8s.wait_until_ready(
+                    namespace=ns,
+                    k8s_wait_attempts=self.k8s_wait_attempts,
+                    k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
+                    timeout=timer)
+
+            # After entire ChartGroup is healthy, run any pending tests
+            for (test, test_timer) in tests_to_run:
+                self._test_chart(test, test_timer)

         LOG.info("Performing Post-Flight Operations")
         self.post_flight_ops()
@@ -423,7 +465,7 @@ class Armada(object):
         if self.enable_chart_cleanup:
             self.tiller.chart_cleanup(
                 prefix,
-                self.manifest[const.KEYWORD_ARMADA][const.KEYWORD_GROUPS])
+                self.manifest[KEYWORD_ARMADA][KEYWORD_GROUPS])

         return msg

@@ -432,53 +474,65 @@ class Armada(object):
         Operations to run after deployment process has terminated
         '''
         # Delete temp dirs used for deployment
-        for group in self.manifest.get(const.KEYWORD_ARMADA, {}).get(
-                const.KEYWORD_GROUPS, []):
-            for ch in group.get(const.KEYWORD_CHARTS, []):
+        for group in self.manifest.get(KEYWORD_ARMADA, {}).get(
+                KEYWORD_GROUPS, []):
+            for ch in group.get(KEYWORD_CHARTS, []):
                 chart = ch.get('chart', {})
                 if chart.get('source', {}).get('type') == 'git':
                     source_dir = chart.get('source_dir')
                     if isinstance(source_dir, tuple) and source_dir:
                         source.source_cleanup(source_dir[0])

+    def _test_chart(self, release_name, timeout):
+        # TODO(MarshM): Fix testing, it's broken, and track timeout
+        resp = self.tiller.testing_release(release_name, timeout=timeout)
+        status = getattr(resp.info.status, 'last_test_suite_run', 'FAILED')
+        LOG.info("Test INFO: %s", status)
+        if resp:
+            LOG.info("PASSED: %s", release_name)
+            return True
+        else:
+            LOG.info("FAILED: %s", release_name)
+            return False
+
     def show_diff(self, chart, installed_chart, installed_values,
                   target_chart, target_values, msg):
-        '''
-        Produce a unified diff of the installed chart vs our intention
+        '''Produce a unified diff of the installed chart vs our intention'''

-        TODO(alanmeadows): This needs to be rewritten to produce better
-        unified diff output
-        '''
+        # TODO(MarshM) This gives decent output comparing values. Would be
+        # nice to clean it up further. Are \\n or \n\n ever valid diffs?
+        # Can these be cleanly converted to dicts, for easier compare?
+        def _sanitize_diff_str(str):
+            return str.replace('\\n', '\n').replace('\n\n', '\n').split('\n')

-        source = str(installed_chart.SerializeToString()).split('\n')
-        chart_diff = list(
-            difflib.unified_diff(source, str(target_chart).split('\n')))
+        source = _sanitize_diff_str(str(installed_chart.SerializeToString()))
+        target = _sanitize_diff_str(str(target_chart))
+        chart_diff = list(difflib.unified_diff(source, target, n=0))

         chart_release = chart.get('release', None)

         if len(chart_diff) > 0:
+            LOG.info("Found diff in Chart (%s)", chart_release)
             diff_msg = []
             for line in chart_diff:
                 diff_msg.append(line)
             msg['diff'].append({'chart': diff_msg})
-            pretty_diff = '\n'.join(diff_msg).replace(
-                '\\n', '\n').replace('\n\n', '\n')
-            LOG.info("Found diff in chart (%s)", chart_release)
+
+            pretty_diff = '\n'.join(diff_msg)
             LOG.debug(pretty_diff)

-        values_diff = list(
-            difflib.unified_diff(
-                installed_values.split('\n'),
-                yaml.safe_dump(target_values).split('\n')))
+        source = _sanitize_diff_str(installed_values)
+        target = _sanitize_diff_str(yaml.safe_dump(target_values))
+        values_diff = list(difflib.unified_diff(source, target, n=0))

         if len(values_diff) > 0:
+            LOG.info("Found diff in values (%s)", chart_release)
             diff_msg = []
             for line in values_diff:
                 diff_msg.append(line)
             msg['diff'].append({'values': diff_msg})
-            pretty_diff = '\n'.join(diff_msg).replace(
-                '\\n', '\n').replace('\n\n', '\n')
-            LOG.info("Found diff in chart values (%s)", chart_release)
+
+            pretty_diff = '\n'.join(diff_msg)
             LOG.debug(pretty_diff)

         result = (len(chart_diff) > 0) or (len(values_diff) > 0)
diff --git a/armada/handlers/k8s.py b/armada/handlers/k8s.py
index 758c7359..87e73442 100644
--- a/armada/handlers/k8s.py
+++ b/armada/handlers/k8s.py
@@ -22,6 +22,7 @@ from kubernetes.client.rest import ApiException
 from oslo_config import cfg
 from oslo_log import log as logging

+from armada.const import DEFAULT_K8S_TIMEOUT
 from armada.utils.release import label_selectors
 from armada.exceptions import k8s_exceptions as exceptions

@@ -48,7 +49,8 @@ class K8s(object):
         self.extension_api = client.ExtensionsV1beta1Api()

     def delete_job_action(self, name, namespace="default",
-                          propagation_policy='Foreground'):
+                          propagation_policy='Foreground',
+                          timeout=DEFAULT_K8S_TIMEOUT):
         '''
         :params name - name of the job
         :params namespace - name of pod that job
@@ -58,13 +60,36 @@ class K8s(object):
            before the Job is marked as deleted.
         '''
         try:
+            LOG.debug('Deleting job %s, Wait timeout=%s', name, timeout)
             body = client.V1DeleteOptions()
-            self.batch_api.delete_namespaced_job(
-                name=name, namespace=namespace, body=body,
-                propagation_policy=propagation_policy)
+            w = watch.Watch()
+            issue_delete = True
+            for event in w.stream(self.batch_api.list_namespaced_job,
+                                  namespace=namespace,
+                                  timeout_seconds=timeout):
+                if issue_delete:
+                    self.batch_api.delete_namespaced_job(
+                        name=name, namespace=namespace, body=body,
+                        propagation_policy=propagation_policy)
+                    issue_delete = False
+
+                event_type = event['type'].upper()
+                job_name = event['object'].metadata.name
+
+                if event_type == 'DELETED' and job_name == name:
+                    LOG.debug('Successfully deleted job %s', job_name)
+                    return
+
+            err_msg = ('Reached timeout while waiting to delete job: '
+                       'name=%s, namespace=%s' % (name, namespace))
+            LOG.error(err_msg)
+            raise exceptions.KubernetesWatchTimeoutException(err_msg)
+
         except ApiException as e:
-            LOG.error("Exception when deleting job: name=%s, namespace=%s: %s",
-                      name, namespace, e)
+            LOG.exception(
+                "Exception when deleting job: name=%s, namespace=%s",
+                name, namespace)
+            raise e

     def get_namespace_job(self, namespace="default", label_selector=''):
         '''
@@ -188,7 +213,8 @@ class K8s(object):
                 LOG.info('New pod %s deployed', new_pod_name)
                 w.stop()

-    def wait_get_completed_podphase(self, release, timeout=300):
+    def wait_get_completed_podphase(self, release,
+                                    timeout=DEFAULT_K8S_TIMEOUT):
         '''
         :param release - part of namespace
         :param timeout - time before disconnecting stream
@@ -207,9 +233,9 @@ class K8s(object):

     def wait_until_ready(self,
                          release=None,
-                         namespace='default',
+                         namespace='',
                          labels='',
-                         timeout=300,
+                         timeout=DEFAULT_K8S_TIMEOUT,
                          k8s_wait_attempts=1,
                          k8s_wait_attempt_sleep=1):
         '''
@@ -232,10 +258,19 @@ class K8s(object):
         sleep_time = (k8s_wait_attempt_sleep if k8s_wait_attempt_sleep >= 1
                       else 1)

-        LOG.debug("Wait on %s (%s) for %s sec (k8s wait %s times, sleep %ss)",
+        LOG.debug("Wait on namespace=(%s) labels=(%s) for %s sec "
+                  "(k8s wait %s times, sleep %ss)",
                   namespace, label_selector, timeout, wait_attempts,
                   sleep_time)

+        if not namespace:
+            # This shouldn't be reachable
+            LOG.warn('"namespace" not specified, waiting across all available '
+                     'namespaces is likely to cause unintended consequences.')
+        if not label_selector:
+            LOG.warn('"label_selector" not specified, waiting with no labels '
+                     'may cause unintended consequences.')
+
         deadline = time.time() + timeout

         # NOTE(mark-burnett): Attempt to wait multiple times without
@@ -243,7 +278,7 @@ class K8s(object):
         successes = 0
         while successes < wait_attempts:
-            deadline_remaining = int(deadline - time.time())
+            deadline_remaining = int(round(deadline - time.time()))
             if deadline_remaining <= 0:
                 return False
             timed_out, modified_pods, unready_pods = self._wait_one_time(
@@ -253,6 +288,9 @@ class K8s(object):
             if timed_out:
                 LOG.info('Timed out waiting for pods: %s',
                          sorted(unready_pods))
+                raise exceptions.KubernetesWatchTimeoutException(
+                    'Timed out while waiting on namespace=(%s) labels=(%s)' %
+                    (namespace, label_selector))
                 return False

             if modified_pods:
@@ -276,17 +314,23 @@ class K8s(object):
         w = watch.Watch()
         first_event = True

-        # TODO(MarshM) still need to filter pods on namespace
-        for event in w.stream(self.client.list_pod_for_all_namespaces,
-                              label_selector=label_selector,
-                              timeout_seconds=timeout):
+        # Watch across specific namespace, or all
+        kwargs = {
+            'label_selector': label_selector,
+            'timeout_seconds': timeout,
+        }
+        if namespace:
+            func_to_call = self.client.list_namespaced_pod
+            kwargs['namespace'] = namespace
+        else:
+            func_to_call = self.client.list_pod_for_all_namespaces
+
+        for event in w.stream(func_to_call, **kwargs):
             if first_event:
-                pod_list = self.client.list_pod_for_all_namespaces(
-                    label_selector=label_selector,
-                    timeout_seconds=timeout)
+                pod_list = func_to_call(**kwargs)
                 for pod in pod_list.items:
-                    LOG.debug('Setting up to wait for pod %s',
-                              pod.metadata.name)
+                    LOG.debug('Setting up to wait for pod %s namespace=%s',
+                              pod.metadata.name, pod.metadata.namespace)
                     ready_pods[pod.metadata.name] = False
                 first_event = False
diff --git a/armada/handlers/tiller.py b/armada/handlers/tiller.py
index 6a02f3e9..06bd4af8 100644
--- a/armada/handlers/tiller.py
+++ b/armada/handlers/tiller.py
@@ -51,6 +51,16 @@ CONF = cfg.CONF
 LOG = logging.getLogger(__name__)


+class TillerResult(object):
+    '''Object to hold Tiller results for Armada.'''
+    def __init__(self, release, namespace, status, description, version):
+        self.release = release
+        self.namespace = namespace
+        self.status = status
+        self.description = description
+        self.version = version
+
+
 class Tiller(object):
     '''
     The Tiller class supports communication and requests to the Tiller Helm
@@ -164,10 +174,14 @@ class Tiller(object):
         '''
         List Helm Releases
         '''
+        # TODO(MarshM possibly combine list_releases() with list_charts()
+        # since they do the same thing, grouping output differently
         releases = []
         stub = ReleaseServiceStub(self.channel)
         # TODO(mark-burnett): Since we're limiting page size, we need to
         # iterate through all the pages when collecting this list.
+        # NOTE(MarshM): `Helm List` defaults to returning Deployed and Failed,
+        # but this might not be a desireable ListReleasesRequest default.
         req = ListReleasesRequest(limit=RELEASE_LIMIT,
                                   status_codes=[STATUS_DEPLOYED,
                                                 STATUS_FAILED],
@@ -179,6 +193,7 @@ class Tiller(object):
                                          metadata=self.metadata)

         for y in release_list:
+            # TODO(MarshM) this log is too noisy, fix later
             # LOG.debug('Found release: %s', y.releases
             releases.extend(y.releases)

@@ -210,7 +225,7 @@ class Tiller(object):
         return template

     def _pre_update_actions(self, actions, release_name, namespace, chart,
-                            disable_hooks, values):
+                            disable_hooks, values, timeout):
         '''
         :params actions - array of items actions
         :params namespace - name of pod for actions
@@ -225,9 +240,10 @@ class Tiller(object):
                     self.rolling_upgrade_pod_deployment(
                         name, release_name, namespace, labels,
-                        action_type, chart, disable_hooks, values)
+                        action_type, chart, disable_hooks, values, timeout)
         except Exception:
             LOG.warn("Pre: Could not update anything, please check yaml")
+            raise ex.PreUpdateJobDeleteException(name, namespace)

         try:
             for action in actions.get('delete', []):
                 name = action.get('name')
                 action_type = action.get('type')
                 labels = action.get('labels', None)

-                self.delete_resources(
-                    release_name, name, action_type, labels, namespace)
+                self.delete_resources(release_name, name, action_type,
+                                      labels, namespace, timeout)
         except Exception:
             LOG.warn("PRE: Could not delete anything, please check yaml")
             raise ex.PreUpdateJobDeleteException(name, namespace)
@@ -275,17 +291,22 @@ class Tiller(object):
         Returns a list of tuples in the form:
         (name, version, chart, values, status)
         '''
+        LOG.debug('Getting known releases from Tiller...')
         charts = []
         for latest_release in self.list_releases():
             try:
-                charts.append(
-                    (latest_release.name, latest_release.version,
-                     latest_release.chart, latest_release.config.raw,
-                     latest_release.info.status.Code.Name(
-                         latest_release.info.status.code)))
-            except IndexError:
+                release = (
+                    latest_release.name, latest_release.version,
+                    latest_release.chart, latest_release.config.raw,
+                    latest_release.info.status.Code.Name(
+                        latest_release.info.status.code))
+                charts.append(release)
+                LOG.debug('Found release %s, version %s, status: %s',
+                          release[0], release[1], release[4])
+            except (AttributeError, IndexError) as e:
+                LOG.debug('%s while getting releases: %s, ex=%s',
+                          e.__class__.__name__, latest_release, e)
                 continue

-        # LOG.debug('List of Helm Charts from Latest Releases: %s', charts)
         return charts

     def update_release(self, chart, release, namespace,
@@ -312,7 +333,7 @@ class Tiller(object):
         values = Config(raw=values)

         self._pre_update_actions(pre_actions, release, namespace, chart,
-                                 disable_hooks, values)
+                                 disable_hooks, values, timeout)

         # build release install request
         try:
@@ -329,7 +350,16 @@ class Tiller(object):
             update_msg = stub.UpdateRelease(
                 release_request, rel_timeout + GRPC_EPSILON,
                 metadata=self.metadata)
-            return update_msg
+
+            tiller_result = TillerResult(
+                update_msg.release.name,
+                update_msg.release.namespace,
+                update_msg.release.info.status.Code.Name(
+                    update_msg.release.info.status.code),
+                update_msg.release.info.Description,
+                update_msg.release.version)
+
+            return tiller_result
         except Exception:
             LOG.exception('Error while updating release %s', release)
             status = self.get_release_status(release)
@@ -372,7 +402,16 @@ class Tiller(object):
             install_msg = stub.InstallRelease(
                 release_request, rel_timeout + GRPC_EPSILON,
                 metadata=self.metadata)
-            return install_msg
+
+            tiller_result = TillerResult(
+                install_msg.release.name,
+                install_msg.release.namespace,
+                install_msg.release.info.status.Code.Name(
+                    install_msg.release.info.status.code),
+                install_msg.release.info.Description,
+                install_msg.release.version)
+
+            return tiller_result
         except Exception:
             LOG.exception('Error while installing release %s', release)
             status = self.get_release_status(release)
@@ -527,7 +566,8 @@ class Tiller(object):
             self.uninstall_release(chart)

     def delete_resources(self, release_name, resource_name, resource_type,
-                         resource_labels, namespace, wait=False):
+                         resource_labels, namespace, wait=False,
+                         timeout=TILLER_TIMEOUT):
         '''
         :params release_name - release name the specified resource is under
         :params resource_name - name of specific resource
@@ -541,16 +581,16 @@ class Tiller(object):
         label_selector = ''
         if resource_labels is not None:
             label_selector = label_selectors(resource_labels)
-        LOG.debug("Deleting resources in namespace %s matching"
+        LOG.debug("Deleting resources in namespace %s matching "
                   "selectors %s.", namespace, label_selector)

         if 'job' in resource_type:
             get_jobs = self.k8s.get_namespace_job(namespace, label_selector)
             for jb in get_jobs.items:
                 jb_name = jb.metadata.name
-                LOG.info("Deleting %s in namespace: %s", jb_name, namespace)
-
-                self.k8s.delete_job_action(jb_name, namespace)
+                LOG.info("Deleting job %s in namespace: %s",
+                         jb_name, namespace)
+                self.k8s.delete_job_action(jb_name, namespace, timeout=timeout)

         elif 'pod' in resource_type:
             release_pods = self.k8s.get_namespace_pod(
@@ -558,7 +598,8 @@ class Tiller(object):

             for pod in release_pods.items:
                 pod_name = pod.metadata.name
-                LOG.info("Deleting %s in namespace: %s", pod_name, namespace)
+                LOG.info("Deleting pod %s in namespace: %s",
+                         pod_name, namespace)
                 self.k8s.delete_namespace_pod(pod_name, namespace)
                 if wait:
                     self.k8s.wait_for_pod_redeployment(pod_name, namespace)

@@ -568,7 +609,8 @@
     def rolling_upgrade_pod_deployment(self, name, release_name,
                                        namespace, resource_labels,
                                        action_type, chart,
-                                       disable_hooks, values):
+                                       disable_hooks, values,
+                                       timeout=TILLER_TIMEOUT):
         '''
         update statefullsets (daemon, stateful)
         '''
@@ -607,7 +649,7 @@ class Tiller(object):
             # delete pods
             self.delete_resources(
                 release_name, name, 'pod', resource_labels, namespace,
-                wait=True)
+                wait=True, timeout=timeout)

         else:
             LOG.error("Unable to exectue name: % type: %s", name, action_type)
diff --git a/armada/tests/unit/api/test_armada_controller.py b/armada/tests/unit/api/test_armada_controller.py
index 46d7e682..ca44dac6 100644
--- a/armada/tests/unit/api/test_armada_controller.py
+++ b/armada/tests/unit/api/test_armada_controller.py
@@ -48,8 +48,8 @@ class ArmadaControllerTest(base.BaseControllerTest):
             'disable_update_post': False,
             'enable_chart_cleanup': False,
             'dry_run': False,
-            'tiller_should_wait': False,
-            'tiller_timeout': 100,
+            'force_wait': False,
+            'timeout': 100,
             'tiller_host': None,
             'tiller_port': 44134,
             'tiller_namespace': 'kube-system',
diff --git a/armada/tests/unit/handlers/test_armada.py b/armada/tests/unit/handlers/test_armada.py
index cf2f86a5..fb3fa4e8 100644
--- a/armada/tests/unit/handlers/test_armada.py
+++ b/armada/tests/unit/handlers/test_armada.py
@@ -55,7 +55,8 @@ data:
         location: /tmp/dummy/armada
         subpath: chart_2
     dependencies: []
-    timeout: 5
+    wait:
+      timeout: 10
 ---
 schema: armada/Chart/v1
 metadata:
@@ -72,7 +73,8 @@ data:
         subpath: chart_1
         reference: master
     dependencies: []
-    timeout: 50
+    wait:
+      timeout: 10
 """

 CHART_SOURCES = [('git://github.com/dummy/armada', 'chart_1'),
@@ -104,8 +106,10 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
                             'type': 'git'
                         },
                         'source_dir': CHART_SOURCES[0],
-                        'timeout': 50,
-                        'values': {}
+                        'values': {},
+                        'wait': {
+                            'timeout': 10
+                        }
                     }
                 },
                 {
@@ -120,8 +124,10 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
                             'type': 'local'
                         },
                         'source_dir': CHART_SOURCES[1],
-                        'timeout': 5,
-                        'values': {}
+                        'values': {},
+                        'wait': {
+                            'timeout': 10
+                        }
                     }
                 }
             ],
@@ -212,8 +218,8 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
                 chart_1['namespace'],
                 dry_run=armada_obj.dry_run,
                 values=yaml.safe_dump(chart_1['values']),
-                wait=armada_obj.tiller_should_wait,
-                timeout=armada_obj.tiller_timeout),
+                timeout=10,
+                wait=True),
             mock.call(
                 mock_chartbuilder().get_helm_chart(),
                 "{}-{}".format(armada_obj.manifest['armada']['release_prefix'],
                                chart_2['chart_name']),
                 chart_2['namespace'],
                 dry_run=armada_obj.dry_run,
                 values=yaml.safe_dump(chart_2['values']),
-                wait=armada_obj.tiller_should_wait,
-                timeout=armada_obj.tiller_timeout)
+                timeout=10,
+                wait=True)
         ]
         mock_tiller.return_value.install_release.assert_has_calls(method_calls)
diff --git a/docs/source/operations/exceptions/armada-exceptions.inc b/docs/source/operations/exceptions/armada-exceptions.inc
index 58523bfb..0fb27120 100644
--- a/docs/source/operations/exceptions/armada-exceptions.inc
+++ b/docs/source/operations/exceptions/armada-exceptions.inc
@@ -18,6 +18,14 @@
    :widths: 5 50
    :header-rows: 1

+   * - Exception Name
+     - Description
+   * - ArmadaTimeoutException
+     - .. autoexception:: armada.exceptions.armada_exceptions.ArmadaTimeoutException
+          :members:
+          :show-inheritance:
+          :undoc-members:
+
    * - Exception Name
      - Description
    * - KnownReleasesException
diff --git a/docs/source/operations/exceptions/k8s-exceptions.inc b/docs/source/operations/exceptions/k8s-exceptions.inc
index 187c0428..24e89459 100644
--- a/docs/source/operations/exceptions/k8s-exceptions.inc
+++ b/docs/source/operations/exceptions/k8s-exceptions.inc
@@ -31,3 +31,8 @@
           :members:
           :show-inheritance:
           :undoc-members:
+   * - KubernetesWatchTimeoutException
+     - .. autoexception:: armada.exceptions.k8s_exceptions.KubernetesWatchTimeoutException
+          :members:
+          :show-inheritance:
+          :undoc-members:
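Note (illustrative, not part of the patch): the k8s.py changes above switch wait_until_ready from watching pods across all namespaces to watching a single namespace when one is given, and raise KubernetesWatchTimeoutException when the deadline passes. A standalone sketch of that watch pattern with the kubernetes Python client; the function and variable names here are hypothetical:

    from kubernetes import client, config, watch

    def watch_pods(namespace, label_selector, timeout):
        # Watch pods in one namespace with a label selector and a hard
        # timeout, mirroring the namespaced branch of _wait_one_time().
        config.load_kube_config()
        core_v1 = client.CoreV1Api()
        w = watch.Watch()
        for event in w.stream(core_v1.list_namespaced_pod,
                              namespace=namespace,
                              label_selector=label_selector,
                              timeout_seconds=timeout):
            pod = event['object']
            print(event['type'], pod.metadata.name, pod.status.phase)
        # The stream ends when timeout_seconds expires; the patched
        # wait_until_ready() raises KubernetesWatchTimeoutException at that
        # point instead of silently returning False.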