armada/armada/handlers/armada.py

# Copyright 2017 The Armada Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import time
import yaml

from oslo_config import cfg
from oslo_log import log as logging

from armada import const
from armada.exceptions import armada_exceptions
from armada.exceptions import override_exceptions
from armada.exceptions import source_exceptions
from armada.exceptions import tiller_exceptions
from armada.exceptions import validate_exceptions
from armada.handlers.chartbuilder import ChartBuilder
from armada.handlers.manifest import Manifest
from armada.handlers.override import Override
from armada.handlers.release_diff import ReleaseDiff
from armada.handlers.test import test_release_for_success
from armada.handlers.tiller import Tiller
from armada.handlers.wait import get_wait_for
from armada.utils.release import release_prefixer
from armada.utils import source

LOG = logging.getLogger(__name__)
CONF = cfg.CONF


class Armada(object):
    '''
    This is the main Armada class handling the Armada
    workflows
    '''

    def __init__(self,
                 documents,
                 disable_update_pre=False,
                 disable_update_post=False,
                 enable_chart_cleanup=False,
                 dry_run=False,
                 set_ovr=None,
                 force_wait=False,
                 timeout=0,
                 tiller_host=None,
                 tiller_port=None,
                 tiller_namespace=None,
                 values=None,
                 target_manifest=None,
                 k8s_wait_attempts=1,
                 k8s_wait_attempt_sleep=1):
        '''
        Initialize the Armada engine and establish a connection to Tiller.

        :param List[dict] documents: Armada documents.
        :param bool disable_update_pre: Disable pre-update Tiller operations.
        :param bool disable_update_post: Disable post-update Tiller
            operations.
        :param bool enable_chart_cleanup: Clean up unmanaged charts.
        :param bool dry_run: Run charts without installing them.
        :param bool force_wait: Force Tiller to wait until all charts are
            deployed, rather than using each chart's specified wait policy.
        :param int timeout: Specifies overall time in seconds that Tiller
            should wait for charts until timing out.
        :param str tiller_host: Tiller host IP. Default is None.
        :param int tiller_port: Tiller host port. Default is
            ``CONF.tiller_port``.
        :param str tiller_namespace: Tiller host namespace. Default is
            ``CONF.tiller_namespace``.
        :param str target_manifest: The target manifest to run. Useful for
            specifying which manifest to run when multiple are available.
        :param int k8s_wait_attempts: The number of times to attempt waiting
            for pods to become ready.
        :param int k8s_wait_attempt_sleep: The time in seconds to sleep
            between attempts.
        '''
        tiller_port = tiller_port or CONF.tiller_port
        tiller_namespace = tiller_namespace or CONF.tiller_namespace

        self.disable_update_pre = disable_update_pre
        self.disable_update_post = disable_update_post
        self.enable_chart_cleanup = enable_chart_cleanup
        self.dry_run = dry_run
        self.force_wait = force_wait
        self.timeout = timeout
        # TODO: Use dependency injection i.e. pass in a Tiller instead of
        #       creating it here.
        self.tiller = Tiller(
            tiller_host=tiller_host,
            tiller_port=tiller_port,
            tiller_namespace=tiller_namespace,
            dry_run=dry_run)
        try:
            self.documents = Override(
                documents, overrides=set_ovr,
                values=values).update_manifests()
        except (validate_exceptions.InvalidManifestException,
                override_exceptions.InvalidOverrideValueException):
            raise
        self.k8s_wait_attempts = k8s_wait_attempts
        self.k8s_wait_attempt_sleep = k8s_wait_attempt_sleep
        self.manifest = Manifest(
            self.documents, target_manifest=target_manifest).get_manifest()
        self.cloned_dirs = set()

    def find_release_chart(self, known_releases, release_name):
        '''
        Find a release given a list of known_releases and a release name
        '''
        for release, _, chart, values, _ in known_releases:
            if release == release_name:
                return chart, values

    def pre_flight_ops(self):
        """Perform a series of checks and operations to ensure proper
        deployment.
        """
        LOG.info("Performing pre-flight operations.")

        # Ensure Tiller is available and manifest is valid
        if not self.tiller.tiller_status():
            raise tiller_exceptions.TillerServicesUnavailableException()

        # Clone the chart sources
        repos = {}
        manifest_data = self.manifest.get(const.KEYWORD_ARMADA, {})
        for group in manifest_data.get(const.KEYWORD_GROUPS, []):
            for ch in group.get(const.KEYWORD_CHARTS, []):
                self.tag_cloned_repo(ch, repos)

                for dep in ch.get('chart', {}).get('dependencies', []):
                    self.tag_cloned_repo(dep, repos)

    def tag_cloned_repo(self, ch, repos):
        chart = ch.get('chart', {})
        chart_source = chart.get('source', {})
        location = chart_source.get('location')
        ct_type = chart_source.get('type')
        subpath = chart_source.get('subpath', '.')

        if ct_type == 'local':
            chart['source_dir'] = (location, subpath)
        elif ct_type == 'tar':
            LOG.info('Downloading tarball from: %s', location)

            if not CONF.certs:
                LOG.warn('Disabling server validation certs to extract charts')
                tarball_dir = source.get_tarball(location, verify=False)
            else:
                tarball_dir = source.get_tarball(location, verify=CONF.cert)

            chart['source_dir'] = (tarball_dir, subpath)
        elif ct_type == 'git':
            reference = chart_source.get('reference', 'master')
            repo_branch = (location, reference)

            if repo_branch not in repos:
                auth_method = chart_source.get('auth_method')
                proxy_server = chart_source.get('proxy_server')

                logstr = 'Cloning repo: {} from branch: {}'.format(
                    *repo_branch)
                if proxy_server:
                    logstr += ' proxy: {}'.format(proxy_server)
                if auth_method:
                    logstr += ' auth method: {}'.format(auth_method)
                LOG.info(logstr)

                repo_dir = source.git_clone(
                    *repo_branch,
                    proxy_server=proxy_server,
                    auth_method=auth_method)
                self.cloned_dirs.add(repo_dir)

                repos[repo_branch] = repo_dir
                chart['source_dir'] = (repo_dir, subpath)
            else:
                chart['source_dir'] = (repos.get(repo_branch), subpath)
        else:
            chart_name = chart.get('chart_name')
            raise source_exceptions.ChartSourceException(ct_type, chart_name)

    def _get_releases_by_status(self):
        '''
        Return a list of current releases with DEPLOYED or FAILED status
        '''
        deployed_releases = []
        failed_releases = []
        known_releases = self.tiller.list_charts()
        for release in known_releases:
            if release[4] == const.STATUS_DEPLOYED:
                deployed_releases.append(release)
            elif release[4] == const.STATUS_FAILED:
                failed_releases.append(release)
            else:
                # tiller.list_charts() only looks at DEPLOYED/FAILED so
                # this should be unreachable
                LOG.debug('Ignoring release %s in status %s.', release[0],
                          release[4])

        return deployed_releases, failed_releases

    def sync(self):
        '''
        Synchronize Helm with the Armada Config(s)
        '''
        if self.dry_run:
            LOG.info('Armada is in DRY RUN mode, no changes being made.')

        msg = {
            'install': [],
            'upgrade': [],
            'diff': [],
            'purge': [],
            'protected': []
        }

        # TODO: (gardlt) we need to break up this func into
        # a more cleaner format
        self.pre_flight_ops()

        # extract known charts on tiller right now
        deployed_releases, failed_releases = self._get_releases_by_status()

        manifest_data = self.manifest.get(const.KEYWORD_ARMADA, {})
        prefix = manifest_data.get(const.KEYWORD_PREFIX)

        for chartgroup in manifest_data.get(const.KEYWORD_GROUPS, []):
            cg_name = chartgroup.get('name', '<missing name>')
            cg_desc = chartgroup.get('description', '<missing description>')
            cg_sequenced = chartgroup.get('sequenced', False)
            LOG.info('Processing ChartGroup: %s (%s), sequenced=%s', cg_name,
                     cg_desc, cg_sequenced)

            # TODO(MarshM): Deprecate the `test_charts` key
            cg_test_all_charts = chartgroup.get('test_charts')
            if isinstance(cg_test_all_charts, bool):
                LOG.warn('The ChartGroup `test_charts` key is deprecated, '
                         'and support for this will be removed. See the '
                         'Chart `test` key for more information.')
            else:
                # This key defaults to True. Individual charts must
                # explicitly disable helm tests if they choose
                cg_test_all_charts = True

            ns_label_set = set()
            tests_to_run = []

            cg_charts = chartgroup.get(const.KEYWORD_CHARTS, [])

            # Track largest Chart timeout to stop the ChartGroup at the end
            cg_max_timeout = 0

            for chart_entry in cg_charts:
                chart = chart_entry.get('chart', {})
                namespace = chart.get('namespace')
                release = chart.get('release')
                release_name = release_prefixer(prefix, release)
                LOG.info('Processing Chart, release=%s', release_name)

                values = chart.get('values', {})
                pre_actions = {}
                post_actions = {}

                protected = chart.get('protected', {})
                p_continue = protected.get('continue_processing', False)

                # Check for existing FAILED release, and purge
                if release_name in [rel[0] for rel in failed_releases]:
                    LOG.info('Purging FAILED release %s before deployment.',
                             release_name)
                    if protected:
                        if p_continue:
                            LOG.warn(
                                'Release %s is `protected`, '
                                'continue_processing=True. Operator must '
                                'handle FAILED release manually.',
                                release_name)
                            msg['protected'].append(release_name)
                            continue
                        else:
                            LOG.error(
                                'Release %s is `protected`, '
                                'continue_processing=False.', release_name)
                            raise armada_exceptions.ProtectedReleaseException(
                                release_name)
                    else:
                        # Purge the release
                        self.tiller.uninstall_release(release_name)
                        msg['purge'].append(release_name)

                # NOTE(MarshM): Calculating `wait_timeout` is unfortunately
                #   overly complex. The order of precedence is currently:
                #   1) User provided override via API/CLI (default 0 if not
                #      provided by client/user).
                #   2) Chart's `data.wait.timeout`, or...
                #   3) Chart's `data.timeout` (deprecated).
                #   4) const.DEFAULT_CHART_TIMEOUT, if nothing is ever
                #      specified, for use in waiting for final ChartGroup
                #      health and helm tests, but ignored for the actual
                #      install/upgrade of the Chart.
                # NOTE(MarshM): Not defining a timeout has a side effect of
                #   allowing Armada to install charts with a circular
                #   dependency defined between components.

                # TODO(MarshM): Deprecated, remove the following block
                deprecated_timeout = chart.get('timeout', None)
                if isinstance(deprecated_timeout, int):
                    LOG.warn('The `timeout` key is deprecated and support '
                             'for this will be removed soon. Use '
                             '`wait.timeout` instead.')

                wait_values = chart.get('wait', {})
                wait_labels = wait_values.get('labels', {})
                wait_timeout = self.timeout
                if wait_timeout <= 0:
                    wait_timeout = wait_values.get('timeout', wait_timeout)
                    # TODO(MarshM): Deprecated, remove the following check
                    if wait_timeout <= 0:
                        wait_timeout = deprecated_timeout or wait_timeout

                # Determine wait logic
                # NOTE(Dan Kim): Conditions to wait are below :
                # 1) set sequenced=True in chart group
                # 2) set force_wait param
                # 3) add Chart's `data.wait.timeout`
                # --timeout param will do not set wait=True, it just change
                # max timeout of chart's deployment. (default: 900)
                this_chart_should_wait = (cg_sequenced or self.force_wait or
                                          (bool(wait_values) and
                                           (wait_timeout > 0)))

                # If there is still no timeout, we need to use a default
                # (item 4 in note above)
                if wait_timeout <= 0:
                    LOG.warn('No Chart timeout specified, using default: %ss',
                             const.DEFAULT_CHART_TIMEOUT)
                    wait_timeout = const.DEFAULT_CHART_TIMEOUT

                # Naively take largest timeout to apply at end
                # TODO(MarshM) better handling of timeout/timer
                cg_max_timeout = max(wait_timeout, cg_max_timeout)

                test_chart_override = chart.get('test')
                # Use old default value when not using newer `test` key
                test_cleanup = True
                if test_chart_override is None:
                    test_this_chart = cg_test_all_charts
                elif isinstance(test_chart_override, bool):
                    LOG.warn('Boolean value for chart `test` key is'
                             ' deprecated and support for this will'
                             ' be removed. Use `test.enabled` '
                             'instead.')
                    test_this_chart = test_chart_override
                else:
                    # NOTE: helm tests are enabled by default
                    test_this_chart = test_chart_override.get('enabled', True)
                    test_cleanup = test_chart_override.get('options', {}).get(
                        'cleanup', False)

                chartbuilder = ChartBuilder(chart)
                new_chart = chartbuilder.get_helm_chart()

                # Begin Chart timeout deadline
                deadline = time.time() + wait_timeout

                # TODO(mark-burnett): It may be more robust to directly call
                # tiller status to decide whether to install/upgrade rather
                # than checking for list membership.
                if release_name in [rel[0] for rel in deployed_releases]:

                    # indicate to the end user what path we are taking
                    LOG.info("Upgrading release %s in namespace %s",
                             release_name, namespace)
                    # extract the installed chart and installed values from the
                    # latest release so we can compare to the intended state
                    old_chart, old_values_string = self.find_release_chart(
                        deployed_releases, release_name)

                    upgrade = chart.get('upgrade', {})
                    disable_hooks = upgrade.get('no_hooks', False)
                    force = upgrade.get('force', False)
                    recreate_pods = upgrade.get('recreate_pods', False)

                    LOG.info("Checking Pre/Post Actions")
                    if upgrade:
                        upgrade_pre = upgrade.get('pre', {})
                        upgrade_post = upgrade.get('post', {})

                        if not self.disable_update_pre and upgrade_pre:
                            pre_actions = upgrade_pre

                        if not self.disable_update_post and upgrade_post:
                            post_actions = upgrade_post

                    try:
                        old_values = yaml.safe_load(old_values_string)
                    except yaml.YAMLError:
                        chart_desc = '{} (previously deployed)'.format(
                            old_chart.metadata.name)
                        raise armada_exceptions.\
                            InvalidOverrideValuesYamlException(chart_desc)

                    LOG.info('Checking for updates to chart release inputs.')
                    diff = self.get_diff(old_chart, old_values, new_chart,
                                         values)

                    if not diff:
                        LOG.info("Found no updates to chart release inputs")
                        continue

                    LOG.info("Found updates to chart release inputs")
                    LOG.debug("%s", diff)
                    msg['diff'].append({chart['release']: str(diff)})

                    # TODO(MarshM): Add tiller dry-run before upgrade and
                    # consider deadline impacts

                    # do actual update
                    timer = int(round(deadline - time.time()))
                    LOG.info('Beginning Upgrade, wait=%s, timeout=%ss',
                             this_chart_should_wait, timer)
                    tiller_result = self.tiller.update_release(
                        new_chart,
                        release_name,
                        namespace,
                        pre_actions=pre_actions,
                        post_actions=post_actions,
                        disable_hooks=disable_hooks,
                        values=yaml.safe_dump(values),
                        wait=this_chart_should_wait,
                        timeout=timer,
                        force=force,
                        recreate_pods=recreate_pods)

                    if this_chart_should_wait:
                        self._wait_until_ready(release_name, wait_labels,
                                               namespace, timer)

                    # Track namespace+labels touched by upgrade
                    ns_label_set.add((namespace, tuple(wait_labels.items())))

                    LOG.info('Upgrade completed with results from Tiller: %s',
                             tiller_result.__dict__)
                    msg['upgrade'].append(release_name)

                # process install
                else:
                    LOG.info("Installing release %s in namespace %s",
                             release_name, namespace)

                    timer = int(round(deadline - time.time()))
                    LOG.info('Beginning Install, wait=%s, timeout=%ss',
                             this_chart_should_wait, timer)
                    tiller_result = self.tiller.install_release(
                        new_chart,
                        release_name,
                        namespace,
                        values=yaml.safe_dump(values),
                        wait=this_chart_should_wait,
                        timeout=timer)

                    if this_chart_should_wait:
                        self._wait_until_ready(release_name, wait_labels,
                                               namespace, timer)

                    # Track namespace+labels touched by install
                    ns_label_set.add((namespace, tuple(wait_labels.items())))

                    LOG.info('Install completed with results from Tiller: %s',
                             tiller_result.__dict__)
                    msg['install'].append(release_name)

                # Keeping track of time remaining
                timer = int(round(deadline - time.time()))
                test_chart_args = (release_name, timer, test_cleanup)
                if test_this_chart:
                    # Sequenced ChartGroup should run tests after each Chart
                    if cg_sequenced:
                        LOG.info(
                            'Running sequenced test, timeout remaining: '
                            '%ss.', timer)
                        self._test_chart(*test_chart_args)

                    # Un-sequenced ChartGroup should run tests at the end
                    else:
                        tests_to_run.append(
                            functools.partial(self._test_chart,
                                              *test_chart_args))

            # End of Charts in ChartGroup
            LOG.info('All Charts applied in ChartGroup %s.', cg_name)

            # After all Charts are applied, we should wait for the entire
            # ChartGroup to become healthy by looking at the namespaces seen
            # TODO(MarshM): Need to determine a better timeout
            #               (not cg_max_timeout)
            if cg_max_timeout <= 0:
                cg_max_timeout = const.DEFAULT_CHART_TIMEOUT
            deadline = time.time() + cg_max_timeout
            for (ns, labels) in ns_label_set:
                labels_dict = dict(labels)
                timer = int(round(deadline - time.time()))
                LOG.info(
                    'Final ChartGroup wait for healthy namespace=%s, '
                    'labels=(%s), timeout remaining: %ss.', ns, labels_dict,
                    timer)
                if timer <= 0:
                    reason = ('Timeout expired waiting on namespace: %s, '
                              'labels: (%s)' % (ns, labels_dict))
                    LOG.error(reason)
                    raise armada_exceptions.ArmadaTimeoutException(reason)

                self._wait_until_ready(
                    release_name=None,
                    wait_labels=labels_dict,
                    namespace=ns,
                    timeout=timer)

            # After entire ChartGroup is healthy, run any pending tests
            for callback in tests_to_run:
                callback()

        self.post_flight_ops()

        if self.enable_chart_cleanup:
            self._chart_cleanup(
                prefix,
                self.manifest[const.KEYWORD_ARMADA][const.KEYWORD_GROUPS], msg)

        LOG.info('Done applying manifest.')
        return msg

    def post_flight_ops(self):
        '''
        Operations to run after deployment process has terminated
        '''
        LOG.info("Performing post-flight operations.")

        # Delete temp dirs used for deployment
        for cloned_dir in self.cloned_dirs:
            LOG.debug('Removing cloned temp directory: %s', cloned_dir)
            source.source_cleanup(cloned_dir)

    def _wait_until_ready(self, release_name, wait_labels, namespace, timeout):
        if self.dry_run:
            LOG.info(
                'Skipping wait during `dry-run`, would have waited on '
                'namespace=%s, labels=(%s) for %ss.', namespace, wait_labels,
                timeout)
            return

        LOG.info('Waiting for release=%s', release_name)

        waits = [
            get_wait_for('job', self.tiller.k8s, skip_if_none_found=True),
            get_wait_for('pod', self.tiller.k8s)
        ]
        deadline = time.time() + timeout
        deadline_remaining = timeout
        for wait in waits:
            if deadline_remaining <= 0:
                reason = (
                    'Timeout expired waiting for release=%s' % release_name)
                LOG.error(reason)
                raise armada_exceptions.ArmadaTimeoutException(reason)

            wait.wait(
                labels=wait_labels,
                namespace=namespace,
                k8s_wait_attempts=self.k8s_wait_attempts,
                k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
                timeout=timeout)
            deadline_remaining = int(round(deadline - time.time()))

    def _test_chart(self, release_name, timeout, cleanup):
        if self.dry_run:
            LOG.info(
                'Skipping test during `dry-run`, would have tested '
                'release=%s with timeout %ss.', release_name, timeout)
            return True

        if timeout <= 0:
            reason = ('Timeout expired before testing '
                      'release %s' % release_name)
            LOG.error(reason)
            raise armada_exceptions.ArmadaTimeoutException(reason)

        success = test_release_for_success(
            self.tiller, release_name, timeout=timeout, cleanup=cleanup)
        if success:
            LOG.info("Test passed for release: %s", release_name)
        else:
            LOG.info("Test failed for release: %s", release_name)
            raise tiller_exceptions.TestFailedException(release_name)

    def _chart_cleanup(self, prefix, charts, msg):
        LOG.info('Processing chart cleanup to remove unspecified releases.')

        valid_releases = []
        for gchart in charts:
            for chart in gchart.get(const.KEYWORD_CHARTS, []):
                valid_releases.append(
                    release_prefixer(prefix,
                                     chart.get('chart', {}).get('release')))

        actual_releases = [x.name for x in self.tiller.list_releases()]
        release_diff = list(set(actual_releases) - set(valid_releases))

        for release in release_diff:
            if release.startswith(prefix):
                LOG.info('Purging release %s as part of chart cleanup.',
                         release)
                self.tiller.uninstall_release(release)
                msg['purge'].append(release)

    def get_diff(self, old_chart, old_values, new_chart, values):
        return ReleaseDiff(old_chart, old_values, new_chart, values).get_diff()