From 0721ed43aa17e913bd6603d26ca7249a0abfc0b1 Mon Sep 17 00:00:00 2001
From: Sean Eagan <sean.eagan@att.com>
Date: Wed, 26 Jun 2019 13:13:13 -0500
Subject: [PATCH] Implement Prometheus metric integration

This implements Prometheus metric integration, including metric
definition, collection, and exportation.

End-user documentation for the supported metric data and the export
interface is included.

Change-Id: Ia0837f28073d6cd8e0220ac84cdd261b32704ae4
---
 README.rst                                  |  11 +-
 armada/api/controller/metrics.py            |  38 +++++
 armada/api/server.py                        |   6 +-
 armada/cli/apply.py                         |  37 +++--
 armada/handlers/armada.py                   |  27 ++-
 armada/handlers/chart_deploy.py             | 175 ++++++++++++--------
 armada/handlers/metrics.py                  | 175 ++++++++++++++++++++
 armada/tests/unit/handlers/test_armada.py   |   1 +
 charts/armada/templates/deployment-api.yaml |   8 +-
 charts/armada/values.yaml                   |   7 +
 doc/source/commands/apply.rst               |  26 +--
 doc/source/operations/index.rst             |   1 +
 doc/source/operations/metrics.rst           |  85 ++++++++++
 entrypoint.sh                               |   3 +
 requirements.txt                            |   1 +
 15 files changed, 498 insertions(+), 103 deletions(-)
 create mode 100644 armada/api/controller/metrics.py
 create mode 100644 armada/handlers/metrics.py
 create mode 100644 doc/source/operations/metrics.rst

diff --git a/README.rst b/README.rst
index a4749cb3..3d3f94d6 100644
--- a/README.rst
+++ b/README.rst
@@ -90,13 +90,16 @@ Which should output something like this::
 For more information on how to install and use Armada, please reference:
 `Armada Quickstart`_.
 
+
 Integration Points
 ------------------
 
 Armada CLI component has the following integration points:
 
   * `Tiller`_ manages Armada chart installations.
-  * `Deckhand`_ supplies storage and management of site designs and secrets.
+  * `Deckhand`_ is one of the supported control document sources for Armada.
+  * `Prometheus`_ exporter is provided for metric data related to application
+    of charts and collections of charts. See `metrics`_.
 
 In addition, Armada's API component has the following integration points:
 
@@ -110,10 +113,12 @@ Further Reading
 
 .. _Manual Install Guide: https://airship-armada.readthedocs.io/en/latest/development/getting-started.html#developer-install-guide
 .. _Armada Quickstart: https://airship-armada.readthedocs.io/en/latest/operations/guide-use-armada.html
+.. _metrics: https://airship-armada.readthedocs.io/en/latest/operations/metrics.html#metrics
 .. _kubectl: https://kubernetes.io/docs/user-guide/kubectl/kubectl_config/
 .. _Tiller: https://docs.helm.sh/using_helm/#easy-in-cluster-installation
-.. _Deckhand: https://opendev.org/airship/deckhand
-.. _Keystone: https://opendev.org/openstack/keystone
+.. _Deckhand: https://opendev.org/airship/deckhand
+.. _Prometheus: https://prometheus.io
+.. _Keystone: https://opendev.org/openstack/keystone
 
 .. |Docker Repository on Quay| image:: https://quay.io/repository/airshipit/armada/status
    :target: https://quay.io/repository/airshipit/armada
diff --git a/armada/api/controller/metrics.py b/armada/api/controller/metrics.py
new file mode 100644
index 00000000..0ad7f95b
--- /dev/null
+++ b/armada/api/controller/metrics.py
@@ -0,0 +1,38 @@
+# Copyright 2019 The Armada Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import falcon
+import prometheus_client
+
+from armada import api
+from armada.handlers import metrics
+
+
+class Metrics(api.BaseResource):
+    '''Controller for exporting Prometheus metrics.
+    '''
+
+    def on_get(self, req, resp):
+        encoder, content_type = prometheus_client.exposition.choose_encoder(
+            req.get_header('Accept'))
+        try:
+            output = encoder(metrics.REGISTRY)
+        except Exception as ex:
+            err_message = 'Failed to generate metric output'
+            self.logger.error(err_message, exc_info=ex)
+            return self.return_error(
+                resp, falcon.HTTP_500, message=err_message)
+        resp.content_type = content_type
+        resp.body = output
+        resp.status = falcon.HTTP_200
diff --git a/armada/api/server.py b/armada/api/server.py
index fd58dfb2..ccc48149 100644
--- a/armada/api/server.py
+++ b/armada/api/server.py
@@ -27,6 +27,7 @@ from armada.api.controller.rollback import Rollback
 from armada.api.controller.test import TestReleasesReleaseNameController
 from armada.api.controller.test import TestReleasesManifestController
 from armada.api.controller.health import Health
+from armada.api.controller.metrics import Metrics
 from armada.api.controller.tiller import Release
 from armada.api.controller.tiller import Status
 from armada.api.controller.validation import Validate
@@ -59,7 +60,7 @@ def create(enable_middleware=CONF.middleware):
     logging.setup(CONF, 'armada')
 
     # Configure API routing
-    url_routes_v1 = (
+    url_routes_v1 = [
         (HEALTH_PATH, Health()),
         ('apply', Apply()),
         ('releases', Release()),
@@ -68,7 +69,8 @@ def create(enable_middleware=CONF.middleware):
         ('tests', TestReleasesManifestController()),
         ('test/{release}', TestReleasesReleaseNameController()),
         ('validatedesign', Validate()),
-    )
+        ('metrics', Metrics()),
+    ]
 
     for route, service in url_routes_v1:
         api.add_route("/api/v1.0/{}".format(route), service)
diff --git a/armada/cli/apply.py b/armada/cli/apply.py
index b1da5129..67a9a2be 100644
--- a/armada/cli/apply.py
+++ b/armada/cli/apply.py
@@ -14,10 +14,12 @@
 
 import click
 from oslo_config import cfg
+import prometheus_client
 import yaml
 
 from armada.cli import CliAction
 from armada.exceptions.source_exceptions import InvalidPathException
+from armada.handlers import metrics
 from armada.handlers.armada import Armada
 from armada.handlers.document import ReferenceResolver
 from armada.handlers.lock import lock_and_thread
@@ -81,6 +83,12 @@ SHORT_DESC = "Command installs manifest charts."
     '--dry-run', help="Run charts without installing them.", is_flag=True)
 @click.option(
     '--enable-chart-cleanup', help="Clean up unmanaged charts.", is_flag=True)
+@click.option(
+    '--metrics-output',
+    help=(
+        "Output path for prometheus metric data, should end in .prom. By "
+        "default, no metric data is output."),
+    default=None)
 @click.option(
     '--use-doc-ref', help="Use armada manifest file reference.", is_flag=True)
 @click.option(
@@ -121,7 +129,7 @@ SHORT_DESC = "Command installs manifest charts."
     '--wait',
     help=(
         "Force Tiller to wait until all charts are deployed, "
-        "rather than using each chart's specified wait policy. "
+        "rather than using each chart's specified wait policy. "
         "This is equivalent to sequenced chartgroups."),
     is_flag=True)
 @click.option(
@@ -135,22 +143,22 @@ SHORT_DESC = "Command installs manifest charts."
 @click.pass_context
 def apply_create(
         ctx, locations, api, disable_update_post, disable_update_pre, dry_run,
-        enable_chart_cleanup, use_doc_ref, set, tiller_host, tiller_port,
-        tiller_namespace, timeout, values, wait, target_manifest, bearer_token,
-        debug):
+        enable_chart_cleanup, metrics_output, use_doc_ref, set, tiller_host,
+        tiller_port, tiller_namespace, timeout, values, wait, target_manifest,
+        bearer_token, debug):
     CONF.debug = debug
     ApplyManifest(
         ctx, locations, api, disable_update_post, disable_update_pre, dry_run,
-        enable_chart_cleanup, use_doc_ref, set, tiller_host, tiller_port,
-        tiller_namespace, timeout, values, wait, target_manifest,
+        enable_chart_cleanup, metrics_output, use_doc_ref, set, tiller_host,
+        tiller_port, tiller_namespace, timeout, values, wait, target_manifest,
         bearer_token).safe_invoke()
 
 
 class ApplyManifest(CliAction):
     def __init__(
             self, ctx, locations, api, disable_update_post, disable_update_pre,
-            dry_run, enable_chart_cleanup, use_doc_ref, set, tiller_host,
-            tiller_port, tiller_namespace, timeout, values, wait,
+            dry_run, enable_chart_cleanup, metrics_output, use_doc_ref, set,
+            tiller_host, tiller_port, tiller_namespace, timeout, values, wait,
             target_manifest, bearer_token):
         super(ApplyManifest, self).__init__()
         self.ctx = ctx
@@ -161,6 +169,7 @@ class ApplyManifest(CliAction):
         self.disable_update_pre = disable_update_pre
         self.dry_run = dry_run
         self.enable_chart_cleanup = enable_chart_cleanup
+        self.metrics_output = metrics_output
         self.use_doc_ref = use_doc_ref
         self.set = set
         self.tiller_host = tiller_host
@@ -210,8 +219,16 @@ class ApplyManifest(CliAction):
                         bearer_token=self.bearer_token,
                         dry_run=self.dry_run) as tiller:
 
-                resp = self.handle(documents, tiller)
-                self.output(resp)
+                try:
+                    resp = self.handle(documents, tiller)
+                    self.output(resp)
+                finally:
+                    if self.metrics_output:
+                        path = self.metrics_output
+                        self.logger.info(
+                            'Storing metrics output in path: {}'.format(path))
+                        prometheus_client.write_to_textfile(
+                            path, metrics.REGISTRY)
         else:
             if len(self.values) > 0:
                 self.logger.error(
diff --git a/armada/handlers/armada.py b/armada/handlers/armada.py
index 413f052a..a5ba5e51 100644
--- a/armada/handlers/armada.py
+++ b/armada/handlers/armada.py
@@ -24,6 +24,7 @@ from armada.exceptions import override_exceptions
 from armada.exceptions import source_exceptions
 from armada.exceptions import tiller_exceptions
 from armada.exceptions import validate_exceptions
+from armada.handlers import metrics
 from armada.handlers.chart_deploy import ChartDeploy
 from armada.handlers.manifest import Manifest
 from armada.handlers.override import Override
@@ -92,8 +93,9 @@ class Armada(object):
             self.documents, target_manifest=target_manifest).get_manifest()
         self.chart_cache = {}
         self.chart_deploy = ChartDeploy(
-            disable_update_pre, disable_update_post, self.dry_run,
-            k8s_wait_attempts, k8s_wait_attempt_sleep, timeout, self.tiller)
+            self.manifest, disable_update_pre, disable_update_post,
+            self.dry_run, k8s_wait_attempts, k8s_wait_attempt_sleep, timeout,
+            self.tiller)
 
     def pre_flight_ops(self):
         """Perform a series of checks and operations to ensure proper
@@ -113,6 +115,12 @@ class Armada(object):
                 self.get_chart(ch)
 
     def get_chart(self, ch):
+        manifest_name = self.manifest['metadata']['name']
+        chart_name = ch['metadata']['name']
+        with metrics.CHART_DOWNLOAD.get_context(manifest_name, chart_name):
+            return self._get_chart(ch)
+
+    def _get_chart(self, ch):
         chart = ch.get(const.KEYWORD_DATA)
         chart_source = chart.get('source', {})
         location = chart_source.get('location')
@@ -171,6 +179,11 @@ class Armada(object):
         '''
         Synchronize Helm with the Armada Config(s)
         '''
+        manifest_name = self.manifest['metadata']['name']
+        with metrics.APPLY.get_context(manifest_name):
+            return self._sync()
+
+    def _sync(self):
         if self.dry_run:
             LOG.info('Armada is in DRY RUN mode, no changes being made.')
 
@@ -207,11 +220,12 @@ class Armada(object):
 
             cg_charts = chartgroup.get(const.KEYWORD_CHARTS, [])
 
-            def deploy_chart(chart):
+            def deploy_chart(chart, concurrency):
                 set_current_chart(chart)
                 try:
                     return self.chart_deploy.execute(
-                        chart, cg_test_all_charts, prefix, known_releases)
+                        chart, cg_test_all_charts, prefix, known_releases,
+                        concurrency)
                 finally:
                     set_current_chart(None)
 
@@ -233,13 +247,14 @@ class Armada(object):
 
             if cg_sequenced:
                 for chart in cg_charts:
-                    if (handle_result(chart, lambda: deploy_chart(chart))):
+                    if (handle_result(chart, lambda: deploy_chart(chart, 1))):
                         break
             else:
                 with ThreadPoolExecutor(
                         max_workers=len(cg_charts)) as executor:
                     future_to_chart = {
-                        executor.submit(deploy_chart, chart): chart
+                        executor.submit(deploy_chart, chart, len(cg_charts)):
+                        chart
                         for chart in cg_charts
                     }
 
diff --git a/armada/handlers/chart_deploy.py b/armada/handlers/chart_deploy.py
index 0f625ca4..5317d129 100644
--- a/armada/handlers/chart_deploy.py
+++ b/armada/handlers/chart_deploy.py
@@ -19,6 +19,7 @@ import yaml
 
 from armada import const
 from armada.exceptions import armada_exceptions
+from armada.handlers import metrics
 from armada.handlers.chartbuilder import ChartBuilder
 from armada.handlers.release_diff import ReleaseDiff
 from armada.handlers.chart_delete import ChartDelete
@@ -33,8 +34,9 @@ LOG = logging.getLogger(__name__)
 
 class ChartDeploy(object):
     def __init__(
-            self, disable_update_pre, disable_update_post, dry_run,
+            self, manifest, disable_update_pre, disable_update_post, dry_run,
             k8s_wait_attempts, k8s_wait_attempt_sleep, timeout, tiller):
+        self.manifest = manifest
         self.disable_update_pre = disable_update_pre
         self.disable_update_post = disable_update_post
         self.dry_run = dry_run
@@ -43,25 +45,26 @@ class ChartDeploy(object):
         self.timeout = timeout
         self.tiller = tiller
 
-    def execute(self, ch, cg_test_all_charts, prefix, known_releases):
+    def execute(
+            self, ch, cg_test_all_charts, prefix, known_releases, concurrency):
+        chart_name = ch['metadata']['name']
+        manifest_name = self.manifest['metadata']['name']
+        with metrics.CHART_HANDLE.get_context(concurrency, manifest_name,
+                                              chart_name):
+            return self._execute(
+                ch, cg_test_all_charts, prefix, known_releases)
+
+    def _execute(self, ch, cg_test_all_charts, prefix, known_releases):
+        manifest_name = self.manifest['metadata']['name']
         chart = ch[const.KEYWORD_DATA]
+        chart_name = ch['metadata']['name']
         namespace = chart.get('namespace')
         release = chart.get('release')
         release_name = r.release_prefixer(prefix, release)
         LOG.info('Processing Chart, release=%s', release_name)
 
-        values = chart.get('values', {})
-        pre_actions = {}
-        post_actions = {}
-
         result = {}
 
-        old_release = self.find_chart_release(known_releases, release_name)
-
-        status = None
-        if old_release:
-            status = r.get_release_status(old_release)
-
         chart_wait = ChartWait(
             self.tiller.k8s,
             release_name,
@@ -70,18 +73,32 @@ class ChartDeploy(object):
             k8s_wait_attempts=self.k8s_wait_attempts,
             k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
             timeout=self.timeout)
-
-        native_wait_enabled = chart_wait.is_native_enabled()
+        wait_timeout = chart_wait.get_timeout()
 
         # Begin Chart timeout deadline
-        deadline = time.time() + chart_wait.get_timeout()
+        deadline = time.time() + wait_timeout
+        old_release = self.find_chart_release(known_releases, release_name)
+        action = metrics.ChartDeployAction.NOOP
+
+        def noop():
+            pass
+
+        deploy = noop
+
+        # Resolve action
+        values = chart.get('values', {})
+        pre_actions = {}
+        post_actions = {}
+
+        status = None
+        if old_release:
+            status = r.get_release_status(old_release)
+
+        native_wait_enabled = chart_wait.is_native_enabled()
 
         chartbuilder = ChartBuilder(ch)
         new_chart = chartbuilder.get_helm_chart()
 
-        # TODO(mark-burnett): It may be more robust to directly call
-        # tiller status to decide whether to install/upgrade rather
-        # than checking for list membership.
         if status == const.STATUS_DEPLOYED:
 
             # indicate to the end user what path we are taking
@@ -135,36 +152,37 @@ class ChartDeploy(object):
             if not diff:
                 LOG.info("Found no updates to chart release inputs")
             else:
+                action = metrics.ChartDeployAction.UPGRADE
                 LOG.info("Found updates to chart release inputs")
                 LOG.debug("%s", diff)
                 result['diff'] = {chart['release']: str(diff)}
 
-                # TODO(MarshM): Add tiller dry-run before upgrade and
-                # consider deadline impacts
+                def upgrade():
+                    # do actual update
+                    timer = int(round(deadline - time.time()))
+                    LOG.info(
+                        "Upgrading release %s in namespace %s, wait=%s, "
+                        "timeout=%ss", release_name, namespace,
+                        native_wait_enabled, timer)
+                    tiller_result = self.tiller.update_release(
+                        new_chart,
+                        release_name,
+                        namespace,
+                        pre_actions=pre_actions,
+                        post_actions=post_actions,
+                        disable_hooks=disable_hooks,
+                        values=yaml.safe_dump(values),
+                        wait=native_wait_enabled,
+                        timeout=timer,
+                        force=force,
+                        recreate_pods=recreate_pods)
 
-                # do actual update
-                timer = int(round(deadline - time.time()))
-                LOG.info(
-                    "Upgrading release %s in namespace %s, wait=%s, "
-                    "timeout=%ss", release_name, namespace,
-                    native_wait_enabled, timer)
-                tiller_result = self.tiller.update_release(
-                    new_chart,
-                    release_name,
-                    namespace,
-                    pre_actions=pre_actions,
-                    post_actions=post_actions,
-                    disable_hooks=disable_hooks,
-                    values=yaml.safe_dump(values),
-                    wait=native_wait_enabled,
-                    timeout=timer,
-                    force=force,
-                    recreate_pods=recreate_pods)
+                    LOG.info(
+                        'Upgrade completed with results from Tiller: %s',
+                        tiller_result.__dict__)
+                    result['upgrade'] = release_name
 
-                LOG.info(
-                    'Upgrade completed with results from Tiller: %s',
-                    tiller_result.__dict__)
-                result['upgrade'] = release_name
+                deploy = upgrade
         else:
             # Check for release with status other than DEPLOYED
             if status:
@@ -178,7 +196,6 @@ class ChartDeploy(object):
                     # was started within the timeout window of the chart.
                     last_deployment_age = r.get_last_deployment_age(
                         old_release)
-                    wait_timeout = chart_wait.get_timeout()
                     likely_pending = last_deployment_age <= wait_timeout
                     if likely_pending:
                         # Give up if a deployment is likely pending, we do not
@@ -217,35 +234,49 @@ class ChartDeploy(object):
                             release_name, status)
                 else:
                     # Purge the release
-                    LOG.info(
-                        'Purging release %s with status %s', release_name,
-                        status)
-                    chart_delete = ChartDelete(
-                        chart, release_name, self.tiller)
-                    chart_delete.delete()
-                    result['purge'] = release_name
+                    with metrics.CHART_DELETE.get_context(manifest_name,
+                                                          chart_name):
 
+                        LOG.info(
+                            'Purging release %s with status %s', release_name,
+                            status)
+                        chart_delete = ChartDelete(
+                            chart, release_name, self.tiller)
+                        chart_delete.delete()
+                        result['purge'] = release_name
+
+            action = metrics.ChartDeployAction.INSTALL
+
+            def install():
+                timer = int(round(deadline - time.time()))
+                LOG.info(
+                    "Installing release %s in namespace %s, wait=%s, "
+                    "timeout=%ss", release_name, namespace,
+                    native_wait_enabled, timer)
+                tiller_result = self.tiller.install_release(
+                    new_chart,
+                    release_name,
+                    namespace,
+                    values=yaml.safe_dump(values),
+                    wait=native_wait_enabled,
+                    timeout=timer)
+
+                LOG.info(
+                    'Install completed with results from Tiller: %s',
+                    tiller_result.__dict__)
+                result['install'] = release_name
+
+            deploy = install
+
+        # Deploy
+        with metrics.CHART_DEPLOY.get_context(wait_timeout, manifest_name,
+                                              chart_name,
+                                              action.get_label_value()):
+            deploy()
+
+            # Wait
             timer = int(round(deadline - time.time()))
-            LOG.info(
-                "Installing release %s in namespace %s, wait=%s, "
-                "timeout=%ss", release_name, namespace, native_wait_enabled,
-                timer)
-            tiller_result = self.tiller.install_release(
-                new_chart,
-                release_name,
-                namespace,
-                values=yaml.safe_dump(values),
-                wait=native_wait_enabled,
-                timeout=timer)
-
-            LOG.info(
-                'Install completed with results from Tiller: %s',
-                tiller_result.__dict__)
-            result['install'] = release_name
-
-        # Wait
-        timer = int(round(deadline - time.time()))
-        chart_wait.wait(timer)
+            chart_wait.wait(timer)
 
         # Test
         just_deployed = ('install' in result) or ('upgrade' in result)
@@ -260,7 +291,9 @@ class ChartDeploy(object):
         run_test = test_handler.test_enabled and (
             just_deployed or not last_test_passed)
         if run_test:
-            self._test_chart(release_name, test_handler)
+            with metrics.CHART_TEST.get_context(test_handler.timeout,
+                                                manifest_name, chart_name):
+                self._test_chart(release_name, test_handler)
 
         return result
 
diff --git a/armada/handlers/metrics.py b/armada/handlers/metrics.py
new file mode 100644
index 00000000..e344f9ee
--- /dev/null
+++ b/armada/handlers/metrics.py
@@ -0,0 +1,175 @@
+# Copyright 2019 The Armada Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from contextlib import ExitStack
+from enum import Enum
+import os
+
+import prometheus_client
+from prometheus_client import multiprocess, values, context_managers
+
+
+class ActionMetrics():
+    """ Support for defining and observing metrics for an action, including
+    tracking attempts, failures, and timing.
+    """
+
+    _PREFIX = 'armada'
+
+    def __init__(self, prefix, description, labels):
+        """
+        :param prefix: prefix to use for each metric name
+        :param description: description of action to use in metric description
+        :param labels: label names to define for each metric
+        """
+        self.full_prefix = '{}_{}'.format(self.__class__._PREFIX, prefix)
+        self.progress = prometheus_client.Gauge(
+            '{}_attempt_inprogress'.format(self.full_prefix),
+            'In progress attempts to {}'.format(description),
+            labels,
+            registry=REGISTRY,
+            multiprocess_mode='livesum')
+        self.attempt_total = prometheus_client.Counter(
+            '{}_attempt_total'.format(self.full_prefix),
+            'Total attempts to {}'.format(description),
+            labels,
+            registry=REGISTRY)
+        self.failure_total = prometheus_client.Counter(
+            '{}_failure_total'.format(self.full_prefix),
+            'Total failures to {}'.format(description),
+            labels,
+            registry=REGISTRY)
+        self.duration = prometheus_client.Histogram(
+            '{}_duration_seconds'.format(self.full_prefix),
+            'Seconds to {}'.format(description),
+            labels,
+            registry=REGISTRY)
+
+    def get_context(self, *args, **kwargs):
+        """ Any extra args are used as metric label values.
+
+        :return: a context manager for the action; the metrics are already
+        being observed when it is returned and stop being observed on exit.
+        :rtype: contextlib.ExitStack
+        """
+        progress = self.progress.labels(*args, **kwargs)
+        attempt_total = self.attempt_total.labels(*args, **kwargs)
+        attempt_total.inc()
+        failure_total = self.failure_total.labels(*args, **kwargs)
+        duration = self.duration.labels(*args, **kwargs)
+
+        e = ExitStack()
+        contexts = [
+            progress.track_inprogress(),
+            failure_total.count_exceptions(),
+            duration.time()
+        ]
+        for ctx in contexts:
+            e.enter_context(ctx)
+        return e
+
+
+class ChartHandleMetrics(ActionMetrics):
+    def __init__(self, prefix, description, labels):
+        super().__init__(prefix, description, labels)
+        self.concurrency = prometheus_client.Histogram(
+            '{}_concurrency_count'.format(self.full_prefix),
+            'Count of charts being handled concurrently for chart',
+            labels,
+            registry=REGISTRY)
+
+    def get_context(self, concurrency_value, *args, **kwargs):
+        concurrency = self.concurrency.labels(*args, **kwargs)
+        concurrency.observe(concurrency_value)
+        return super().get_context(*args, **kwargs)
+
+
+class ActionWithTimeoutMetrics(ActionMetrics):
+    def __init__(self, prefix, description, labels):
+        super().__init__(prefix, description, labels)
+        self.timeout = prometheus_client.Histogram(
+            '{}_timeout_duration_seconds'.format(self.full_prefix),
+            'Configured timeout (in seconds) to {}'.format(description),
+            labels,
+            registry=REGISTRY)
+        self.timeout_usage = prometheus_client.Histogram(
+            '{}_timeout_usage_ratio'.format(self.full_prefix),
+            'Ratio of duration to timeout to {}'.format(description),
+            labels,
+            registry=REGISTRY)
+
+    def get_context(self, timeout_value, *args, **kwargs):
+        timeout = self.timeout.labels(*args, **kwargs)
+        timeout_usage = self.timeout_usage.labels(*args, **kwargs)
+
+        timeout.observe(timeout_value)
+
+        def observe_timeout_usage(duration):
+            # Avoid division by 0
+            if timeout_value:
+                val = duration / timeout_value
+                timeout_usage.observe(val)
+
+        timer = context_managers.Timer(observe_timeout_usage)
+        context = super().get_context(*args, **kwargs)
+        context.enter_context(timer)
+        return context
+
+
+class ChartDeployAction(Enum):
+    """ Enum to define sub-actions for the chart deploy action, to be used as
+    label values.
+    """
+
+    INSTALL = 1
+    UPGRADE = 2
+    NOOP = 3
+
+    def get_label_value(self):
+        """
+        :return: the label value
+        :rtype: str
+        """
+        return self.name.lower()
+
+
+REGISTRY = prometheus_client.CollectorRegistry()
+
+if "prometheus_multiproc_dir" in os.environ:
+    # For why this is needed see:
+    #   https://github.com/prometheus/client_python/issues/275#issuecomment-504755024
+    import uwsgi
+    prometheus_client.values.ValueClass = values.MultiProcessValue(
+        uwsgi.worker_id)
+
+    multiprocess.MultiProcessCollector(REGISTRY)
+
+APPLY = ActionMetrics('apply', 'apply a manifest', ['manifest'])
+# TODO: Ideally include an action (ChartDeployAction) label, but that's not
+# determined until after chart handling starts.
+CHART_HANDLE = ChartHandleMetrics(
+    'chart_handle',
+    'handle a chart (including delete, deploy, test (all as necessary) but '
+    'not download)', ['manifest', 'chart'])
+CHART_DOWNLOAD = ActionMetrics(
+    'chart_download', 'download a chart (will be noop if previously cached)',
+    ['manifest', 'chart'])
+CHART_DELETE = ActionMetrics(
+    'chart_delete', 'delete a chart', ['manifest', 'chart'])
+CHART_DEPLOY = ActionWithTimeoutMetrics(
+    'chart_deploy',
+    'deploy a chart (including install/upgrade and wait (all as necessary))',
+    ['manifest', 'chart', 'action'])
+CHART_TEST = ActionWithTimeoutMetrics(
+    'chart_test', 'test a chart', ['manifest', 'chart'])
diff --git a/armada/tests/unit/handlers/test_armada.py b/armada/tests/unit/handlers/test_armada.py
index 7d5f79d3..7f7ed7be 100644
--- a/armada/tests/unit/handlers/test_armada.py
+++ b/armada/tests/unit/handlers/test_armada.py
@@ -397,6 +397,7 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
                 mock_test_release.side_effect = fail
             else:
                 mock_test_release.return_value = test_success
+            mock_test.return_value.timeout = const.DEFAULT_TEST_TIMEOUT
 
             # Stub out irrelevant methods called by `armada.sync()`.
             mock_chartbuilder.get_source_path.return_value = None
diff --git a/charts/armada/templates/deployment-api.yaml b/charts/armada/templates/deployment-api.yaml
index 750614aa..ddc9ce96 100644
--- a/charts/armada/templates/deployment-api.yaml
+++ b/charts/armada/templates/deployment-api.yaml
@@ -18,6 +18,7 @@ limitations under the License.
 {{- $envAll := . }}
 {{- $mounts_armada_api := .Values.pod.mounts.armada_api.armada_api }}
 {{- $mounts_armada_api_init := .Values.pod.mounts.armada_api.init_container }}
+{{- $prometheus_annotations := $envAll.Values.monitoring.prometheus.armada }}
 {{- $serviceAccountName := "armada-api" }}
 {{ tuple $envAll "api" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
 ---
@@ -79,9 +80,9 @@ spec:
       labels:
 {{ tuple $envAll "armada" "api" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
       annotations:
-{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" | indent 8 }}
         configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
         configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
+{{ tuple $prometheus_annotations | include "helm-toolkit.snippets.prometheus_pod_annotations" | indent 8 }}
     spec:
 {{ dict "envAll" $envAll "application" "armada" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
       serviceAccountName: {{ $serviceAccountName }}
@@ -123,6 +124,8 @@ spec:
           volumeMounts:
             - name: pod-tmp
               mountPath: /tmp
+            - name: pod-tmp-metrics
+              mountPath: /tmp/armada/metrics
             - name: pod-etc-armada
               mountPath: /etc/armada
             - name: armada-etc
@@ -193,6 +196,9 @@ spec:
       volumes:
         - name: pod-tmp
           emptyDir: {}
+        - name: pod-tmp-metrics
+          emptyDir:
+            medium: Memory
         - name: pod-etc-armada
           emptyDir: {}
         - name: armada-bin
diff --git a/charts/armada/values.yaml b/charts/armada/values.yaml
index 099271d6..e5147a52 100644
--- a/charts/armada/values.yaml
+++ b/charts/armada/values.yaml
@@ -216,6 +216,13 @@ conf:
     # greater than that will have no effect.
     prestop_sleep: 30
 
+monitoring:
+  prometheus:
+    armada:
+      scrape: true
+      path: /api/v1.0/metrics
+      port: 8000
+
 pod:
   security_context:
     armada:
diff --git a/doc/source/commands/apply.rst b/doc/source/commands/apply.rst
index 4e713b75..e182e629 100644
--- a/doc/source/commands/apply.rst
+++ b/doc/source/commands/apply.rst
@@ -7,29 +7,29 @@ Commands
 
 .. code:: bash
 
-    Usage: armada apply [OPTIONS] FILENAME
+    Usage: armada apply [OPTIONS] [LOCATIONS]...
 
-      This command installs and updates charts defined in armada manifest
+      This command installs and updates charts defined in Armada manifest.
 
       The apply argument must be relative path to Armada Manifest. Executing
       apply command once will install all charts defined in manifest. Re-
       executing apply command will execute upgrade.
 
-      To see how to create an Armada manifest:
-      https://airship-armada.readthedocs.io/en/latest/operations/
+      To see how to create an Armada manifest:
+      https://airship-armada.readthedocs.io/en/latest/operations/
 
       To install or upgrade charts, run:
 
-          $ armada apply examples/simple.yaml
+              $ armada apply examples/simple.yaml
 
       To override a specific value in a Manifest, run:
 
-          $ armada apply examples/simple.yaml --set manifest:simple-armada:release="wordpress"
+              $ armada apply examples/simple.yaml --set manifest:simple-armada:release="wordpress"
 
       Or to override several values in a Manifest, reference a values.yaml-
       formatted file:
 
-          $ armada apply examples/simple.yaml --values examples/simple-ovr-values.yaml
+              $ armada apply examples/simple.yaml --values examples/simple-ovr-values.yaml
 
     Options:
       --api                         Contacts service endpoint.
@@ -37,6 +37,8 @@ Commands
       --disable-update-pre          Disable pre-update Tiller operations.
       --dry-run                     Run charts without installing them.
       --enable-chart-cleanup        Clean up unmanaged charts.
+      --metrics-output TEXT         The output path for metric data
+      --use-doc-ref                 Use armada manifest file reference.
       --set TEXT                    Use to override Armada Manifest values.
                                     Accepts overrides that adhere to the format
                                     <path>:<to>:<property>=<value> to specify a
@@ -46,15 +48,19 @@ Commands
       --tiller-host TEXT            Tiller host IP.
       --tiller-port INTEGER         Tiller host port.
       -tn, --tiller-namespace TEXT  Tiller namespace.
-      --timeout INTEGER             Specifies time to wait for charts to deploy.
+      --timeout INTEGER             Specifies time to wait for each chart to fully
+                                    finish deploying.
       -f, --values TEXT             Use to override multiple Armada Manifest
                                     values by reading overrides from a
                                     values.yaml-type file.
-      --wait                        Wait until all charts deployed.
+      --wait                        Force Tiller to wait until all charts are
+                                    deployed, rather than using each chart's
+                                    specified wait policy. This is equivalent to
+                                    sequenced chartgroups.
       --target-manifest TEXT        The target manifest to run. Required for
                                     specifying which manifest to run when multiple
                                     are available.
-      --bearer-token                User bearer token.
+      --bearer-token TEXT           User Bearer token
       --debug                       Enable debug logging.
       --help                        Show this message and exit.
 
diff --git a/doc/source/operations/index.rst b/doc/source/operations/index.rst
index 590f6250..2f5e0ac5 100644
--- a/doc/source/operations/index.rst
+++ b/doc/source/operations/index.rst
@@ -14,6 +14,7 @@ Operations Guide
    guide-configure
    guide-troubleshooting
    guide-use-armada
+   metrics
    exceptions/index
    guide-helm-plugin
    sampleconf
diff --git a/doc/source/operations/metrics.rst b/doc/source/operations/metrics.rst
new file mode 100644
index 00000000..17b707ac
--- /dev/null
+++ b/doc/source/operations/metrics.rst
@@ -0,0 +1,85 @@
+.. _metrics:
+
+Metrics
+=======
+
+Armada exposes metric data, for consumption by `Prometheus`_.
+
+Exporting
+---------
+
+Metric data can be exported via:
+
+  * API: Prometheus exporter at `/api/v1.0/metrics`. The Armada chart includes
+    the appropriate Prometheus scrape configurations for this endpoint.
+  * CLI: `--metrics-output=<path>` of `apply` command. The
+    `node exporter text file collector`_ can then be used to export the produced
+    text files to Prometheus.
+
+Metric Names
+------------
+
+Metric names are as follows:
+
+`armada_` + <action> + `_` + <metric>
+
+Supported <action>s
+-------------------
+
+The tree of <action>s below is measured. Supported Prometheus labels are noted.
+Labels are inherited by sub-actions except as noted.
+
+  * `apply`:
+
+    * description: apply a manifest
+    * labels: `manifest`
+    * sub-actions:
+
+      * `chart_handle`:
+
+        * description: fully handle a chart (see below sub-actions)
+        * labels:
+
+          * `chart`
+          * `action` (install|upgrade|noop) (not included in sub-actions)
+        * sub-actions:
+
+          * `chart_download`
+          * `chart_deploy`
+          * `chart_test`
+      * `chart_delete`:
+
+        * description: delete a chart (e.g. due to `FAILED` status)
+        * labels: `chart`
+
+Supported <metric>s
+-------------------
+
+  * `failure_total`: total failed attempts
+  * `attempt_total`: total attempts
+  * `attempt_inprogress`: total attempts in progress
+  * `duration_seconds`: duration of each attempt
+
+Timeouts
+^^^^^^^^
+
+The `chart_deploy` and `chart_test` actions additionally include the following
+metrics:
+
+  * `timeout_duration_seconds`: configured chart timeout duration in seconds
+  * `timeout_usage_ratio`: `= duration_seconds / timeout_duration_seconds`
+
+These can help identify charts whose timeouts may need to
+be changed to avoid potential failures or to achieve faster failures.
+
+Chart concurrency
+^^^^^^^^^^^^^^^^^
+
+The `chart_handle` action additionally includes the following metric:
+
+  * `concurrency_count`: count of charts being handled concurrently
+
+This can help identify opportunities for greater chart concurrency.
+
+.. _Prometheus: https://prometheus.io
+.. _`node exporter text file collector`: https://github.com/prometheus/node_exporter#textfile-collector
diff --git a/entrypoint.sh b/entrypoint.sh
index 8b3b39f3..6a84f3f2 100755
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -26,6 +26,8 @@ ARMADA_UWSGI_TIMEOUT=${ARMADA_UWSGI_TIMEOUT:-3600}
 ARMADA_UWSGI_WORKERS=${ARMADA_UWSGI_WORKERS:-4}
 # Threads per worker
 ARMADA_UWSGI_THREADS=${ARMADA_UWSGI_THREADS:-1}
+# Prometheus multiprocess dir
+ARMADA_PROMETHEUS_MULTIPROC_DIR=${ARMADA_PROMETHEUS_MULTIPROC_DIR:-$(mktemp -d -p /tmp/armada/metrics XXXXXX)}
 
 # Start Armada application
 # TODO(fmontei): Should be specifying callable too. But Armada spins up the
@@ -37,6 +39,7 @@ if [ "$1" = 'server' ]; then
         --http :"${ARMADA_UWSGI_PORT}" \
         --http-timeout "$ARMADA_UWSGI_TIMEOUT" \
         --enable-threads \
+        --env prometheus_multiproc_dir="$ARMADA_PROMETHEUS_MULTIPROC_DIR" \
         -L \
         --lazy-apps \
         --master \
diff --git a/requirements.txt b/requirements.txt
index 9697878c..0e8051a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ PasteDeploy>=1.5.2
 protobuf>=3.4.0
 PyYAML==3.12
 requests
+prometheus_client==0.7.0
 
 # API
 falcon