diff --git a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml index 48e36e23a..17dd92d06 100644 --- a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml +++ b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml @@ -68,6 +68,12 @@ metadata: path: .osh_infra.grafana.oslo_db dest: path: .values.endpoints.oslo_db.auth.user + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.monitoring.auth.admin - src: schema: pegleg/AccountCatalogue/v1 name: osh_infra_service_accounts @@ -120,6 +126,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_oslo_db_admin_password path: . + - dest: + path: .values.endpoints.monitoring.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . # LDAP Configuration Details - src: @@ -211,6 +223,10 @@ data: node_selector_key: openstack-control-plane node_selector_value: enabled conf: + provisioning: + datasources: + monitoring: + url: http://prom-metrics.osh-infra.svc.cluster.local:80/ ldap: config: base_dns: diff --git a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml index 310cfec33..1d90c3ccd 100644 --- a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml +++ b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml @@ -31,6 +31,13 @@ metadata: path: .osh_infra.oslo_db dest: path: .values.endpoints.olso_db + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.prometheus_mysql_exporter + dest: + path: .values.endpoints.prometheus_mysql_exporter + # Accounts - src: schema: pegleg/AccountCatalogue/v1 @@ -38,6 +45,12 @@ metadata: path: .osh_infra.oslo_db.admin dest: path: .values.endpoints.oslo_db.auth.admin + - src: + schema: 
pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus_mysql_exporter.user + dest: + path: .values.endpoints.prometheus_mysql_exporter.auth.user # Secrets - dest: @@ -46,7 +59,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_oslo_db_admin_password path: . - + - dest: + path: .values.endpoints.oslo_db.auth.exporter.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_oslo_db_exporter_password + path: . data: chart_name: osh-infra-mariadb release: osh-infra-mariadb @@ -72,6 +90,9 @@ data: prometheus_mysql_exporter: node_selector_key: openstack-control-plane node_selector_value: enabled + monitoring: + prometheus: + enabled: true dependencies: - osh-helm-toolkit ... diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml index 4cb879cd4..be06ca8ad 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml @@ -13,5 +13,6 @@ data: - prometheus - prometheus-alertmanager - prometheus-node-exporter + - prometheus-process-exporter - prometheus-kube-state-metrics - nagios diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml index 35ff41b3e..ee7122e8e 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml @@ -37,6 +37,12 @@ metadata: path: .osh_infra.monitoring dest: path: .values.endpoints.monitoring + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.elasticsearch + dest: + path: .values.endpoints.elasticsearch - src: schema: pegleg/EndpointCatalogue/v1 name: osh_infra_endpoints @@ -51,6 +57,18 @@ metadata: path: .osh_infra.nagios.admin dest: path: .values.endpoints.nagios.auth.admin 
+ - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.monitoring.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.elasticsearch.admin + dest: + path: .values.endpoints.elasticsearch.auth.admin # Secrets - dest: @@ -59,6 +77,18 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_nagios_admin_password path: . + - dest: + path: .values.endpoints.elasticsearch.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_elasticsearch_admin_password + path: . + - dest: + path: .values.endpoints.monitoring.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . # LDAP Details - src: diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml new file mode 100644 index 000000000..d64e8564e --- /dev/null +++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml @@ -0,0 +1,65 @@ +--- +schema: armada/Chart/v1 +metadata: + schema: metadata/Document/v1 + name: prometheus-process-exporter + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext + substitutions: + # Chart source + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .charts.osh_infra.prometheus_process_exporter + dest: + path: .source + + # Images + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .images.osh_infra.prometheus_process_exporter + dest: + path: .values.images.tags + + # Endpoints + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.process_exporter_metrics + dest: + path: .values.endpoints.process_exporter_metrics + +data: + chart_name: prometheus-process-exporter + release: 
prometheus-process-exporter + namespace: kube-system + wait: + timeout: 900 + labels: + release_group: airship-prometheus-process-exporter + install: + no_hooks: false + upgrade: + no_hooks: false + pre: + delete: + - type: job + labels: + release_group: airship-prometheus-process-exporter + create: [] + post: + create: [] + values: + labels: + node_exporter: + node_selector_key: node-exporter + node_selector_value: enabled + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + dependencies: + - osh-infra-helm-toolkit +... diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml index 9674e8897..bfde817a9 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml @@ -37,6 +37,42 @@ metadata: path: .osh_infra.alerts dest: path: .values.endpoints.alerts + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.ldap + dest: + path: .values.endpoints.ldap + + # Accounts + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.prometheus.auth.admin + + # Secrets + - dest: + path: .values.endpoints.prometheus.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . + + # LDAP Details + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.ldap.admin + dest: + path: .values.endpoints.ldap.auth.admin + - dest: + path: .values.endpoints.ldap.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_keystone_ldap_password + path: . 
data: chart_name: prometheus @@ -72,9 +108,1545 @@ data: pod: replicas: prometheus: 3 + resources: + enabled: true + prometheus: + limits: + memory: "64Gi" + cpu: "4000m" + requests: + memory: "16Gi" + cpu: "2000m" storage: requests: storage: 500Gi + conf: + prometheus: + command_line_flags: + storage.tsdb.max_block_duration: 17h + scrape_configs: + global: + scrape_interval: 60s + evaluation_interval: 60s + scrape_configs: + # NOTE(srwilkers): The job definition for Prometheus should always be + # listed first, so we can inject the basic auth username and password + # via the endpoints section + - job_name: 'prometheus-metrics' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: "prom-metrics" + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: instance + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - job_name: kubelet + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + scrape_interval: 45s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + - source_labels: + - __meta_kubernetes_node_name + action: replace + target_label: kubernetes_io_hostname + # Scrape config for Kubelet cAdvisor. 
+ # + # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics + # (those whose names begin with 'container_') have been removed from the + # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to + # retrieve those metrics. + # + # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor + # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" + # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with + # the --cadvisor-port=0 Kubelet flag). + # + # This job is not necessary and should be removed in Kubernetes 1.6 and + # earlier versions, or it will cause the metrics to be scraped twice. + - job_name: 'kubernetes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . 
+ tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'container_network_tcp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_tasks_state' + action: drop + - source_labels: + - __name__ + regex: 'container_network_udp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failures_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_load_average_10s' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_system_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_user_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_free' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_current' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_weighted_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_read_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_total' + action: drop + - source_labels: + - __name__ + regex: 
'container_fs_sector_reads_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_sector_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_write_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_bytes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_last_seen' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_cache' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failcnt' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_max_usage_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_rss' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_swap' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_usage_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_packets_dropped_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_dropped_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_period' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_shares' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 
'container_spec_memory_reservation_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_swap_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_start_time_seconds' + action: drop + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'apiserver' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 45s + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + # insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. 
+ relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + action: keep + regex: default;kubernetes;https + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'rest_client_request_latency_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_response_sizes_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_step_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_count' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_sum' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_request_latencies_summary' + action: drop + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. 
+ - job_name: 'openstack-exporter' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: "openstack-metrics" + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: instance + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: drop + regex: '(openstack-metrics|prom-metrics)' + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared). 
+ - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - job_name: calico-etcd + kubernetes_sd_configs: + - role: service + scrape_interval: 20s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: keep + source_labels: + - __meta_kubernetes_service_name + regex: "calico-etcd" + - action: keep + source_labels: + - __meta_kubernetes_namespace + regex: kube-system + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - source_labels: + - __meta_kubernetes_service_label + target_label: job + regex: calico-etcd + replacement: ${1} + - target_label: endpoint + replacement: "calico-etcd" + alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_application] + regex: alertmanager + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: 
alerts-api + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: peer-mesh + action: drop + rules: + alertmanager: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. 
+ summary: Alertmanager configuration reload has failed + etcd3: + groups: + - name: etcd3.rules + rules: + - alert: etcd_InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: etcd_NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: etcd_HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: etcd_HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + 
severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow + summary: slow gRPC requests + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: etcd_HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow + summary: slow HTTP requests + - alert: etcd_EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: etcd_HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour + summary: a high number of proposals within the etcd 
cluster are failing + - alert: etcd_HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: etcd_HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + kube_apiserver: + groups: + - name: kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high + kube_controller_manager: + groups: + - name: kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager-discovery"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication controllers are not making progress. 
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + kubelet: + groups: + - name: kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_ready{condition="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery. 
+ summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 + summary: Kubelet is close to pod limit + kubernetes: + groups: + - name: kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: 
cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: 
cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: 
cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - alert: kube_statefulset_replicas_unavailable + expr: kube_statefulset_status_replicas < kube_statefulset_replicas + for: 5m + labels: + severity: page + annotations: + description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired' + summary: '{{$labels.statefulset}}: has insufficient replicas.' + - alert: kube_daemonsets_misscheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run' + summary: 'Daemonsets not scheduled correctly' + - alert: kube_daemonsets_not_scheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number' + summary: 'Less than desired number of daemonsets scheduled' + - alert: kube_deployment_replicas_unavailable + expr: kube_deployment_status_replicas_unavailable > 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable' + summary: '{{$labels.deployment}}: has insufficient replicas.' 
+ - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable + expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update' + summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.' + - alert: kube_job_status_failed + expr: kube_job_status_failed > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Job {{$labels.exported_job}} is in failed status' + summary: '{{$labels.exported_job}} has failed status' + - alert: kube_pod_status_pending + expr: kube_pod_status_phase{phase="Pending"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status' + - alert: kube_pod_error_image_pull + expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: kube_pod_status_error_image_pull + expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: kube_replicaset_missing_replicas + expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0 + for: 10m + labels: + severity: 
page + annotations: + description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes' + summary: 'Replicaset {{$labels.replicaset}} is missing replicas' + - alert: kube_pod_container_terminated + expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: volume_claim_capacity_high_utilization + expr: (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) < 0.20 + for: 5m + labels: + severity: page + annotations: + description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity' + summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.' + basic_linux: + groups: + - name: basic_linux.rules + rules: + - alert: node_filesystem_full_80percent + expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} + * 0.2) / 1024 ^ 3 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + got less than 20% space left on its filesystem.' + summary: '{{$labels.alias}}: Filesystem is running out of space soon.' + - alert: node_filesystem_full_in_4h + expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + is running out of space in approx. 4 hours' + summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.' 
+ - alert: node_filedescriptors_full_in_3h + expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum + for: 20m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running out of available file descriptors + in approx. 3 hours' + summary: '{{$labels.alias}} is running out of available file descriptors in + 3 hours.' + - alert: node_load1_90percent + expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running with > 90% total load for at least + 1h.' + summary: '{{$labels.alias}}: Running on high load.' + - alert: node_cpu_util_90percent + expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} has total CPU utilization over 90% for at least + 1h.' + summary: '{{$labels.alias}}: High CPU utilization.' + - alert: node_ram_using_90percent + expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal + * 0.1 + for: 30m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using at least 90% of its RAM for at least + 30 minutes now.' + summary: '{{$labels.alias}}: Using lots of RAM.' + - alert: node_swap_using_80percent + expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) + > node_memory_SwapTotal * 0.8 + for: 10m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using 80% of its swap space for at least + 10 minutes now.' + summary: '{{$labels.alias}}: Running out of swap soon.' 
+ - alert: node_high_cpu_load + expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0 + for: 1m + labels: + severity: warning + annotations: + description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}' + summary: '{{$labels.alias}}: Running on high load: {{$value}}' + - alert: node_high_memory_load + expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85 + for: 1m + labels: + severity: warning + annotations: + description: Host memory usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server memory is almost full + - alert: node_high_storage_load + expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) + / node_filesystem_size{mountpoint="/"} * 100 > 85 + for: 30s + labels: + severity: warning + annotations: + description: Host storage usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server storage is almost full + - alert: node_high_swap + expr: (node_memory_SwapTotal - node_memory_SwapFree) < (node_memory_SwapTotal + * 0.4) + for: 1m + labels: + severity: warning + annotations: + description: Host system has a high swap usage of {{ humanize $value }}. Reported + by instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server has a high swap usage + - alert: node_high_network_drop_rcv + expr: node_network_receive_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high drop in network reception ({{ + humanize $value }}). 
Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high receive drop + - alert: node_high_network_drop_send + expr: node_network_transmit_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high drop in network transmission ({{ + humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high transmit drop + - alert: node_high_network_errs_rcv + expr: node_network_receive_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high error rate in network reception + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusually high reception errors + - alert: node_high_network_errs_send + expr: node_network_transmit_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high error rate in network transmission + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusually high transmission errors + - alert: node_network_conntrack_usage_80percent + expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8) + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit' + summary: '{{$labels.instance}}: available network conntrack entries are low.' + - alert: node_entropy_available_low + expr: node_entropy_available_bits < 300 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300' + summary: '{{$labels.instance}}: is low on entropy bits.' 
+ - alert: node_hwmon_high_cpu_temp + expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}' + summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}' + - alert: node_vmstat_paging_rate_high + expr: irate(node_vmstat_pgpgin[5m]) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}' + summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}' + - alert: node_xfs_block_allocation_high + expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}' + summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}' + - alert: node_network_bond_slaves_down + expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0 + for: 5m + labels: + severity: page + annotations: + description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).' 
+ summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)' + - alert: node_numa_memory_used + expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}' + summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}' + - alert: node_ntp_clock_skew_high + expr: abs(node_ntp_drift_seconds) > 2 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}' + summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds' + - alert: node_disk_read_latency + expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high read latency of {{ $value }}' + summary: 'High read latency observed for device {{ $labels.device }}' + - alert: node_disk_write_latency + expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high write latency of {{ $value }}' + summary: 'High write latency observed for device {{ $labels.device }}' + openstack: + groups: + - name: openstack.rules + rules: + - alert: os_glance_api_availability + expr: check_glance_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Glance API is not available at {{$labels.url}}' + - alert: os_nova_api_availability + expr: check_nova_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Nova API is not available at {{$labels.url}}' + - alert: 
os_keystone_api_availability + expr: check_keystone_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Keystone API is not available at {{$labels.url}}' + - alert: os_neutron_api_availability + expr: check_neutron_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Neutron API is not available at {{$labels.url}}' + - alert: os_swift_api_availability + expr: check_swift_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Swift API is not available at {{$labels.url}}' + - alert: os_nova_compute_disabled + expr: services_nova_compute_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-compute is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-compute is disabled on some hosts' + - alert: os_nova_conductor_disabled + expr: services_nova_conductor_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-conductor is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-conductor is disabled on some hosts' + - alert: os_nova_consoleauth_disabled + expr: services_nova_consoleauth_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-consoleauth is disabled on some hosts' + - alert: os_nova_scheduler_disabled + expr: services_nova_scheduler_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-scheduler 
is disabled on some hosts' + ceph: + groups: + - name: ceph.rules + rules: + - alert: ceph_monitor_quorum_low + expr: ceph_monitor_quorum_count < 3 + for: 5m + labels: + severity: page + annotations: + description: 'ceph monitor quorum has been less than 3 for more than 5 minutes' + summary: 'ceph high availability is at risk' + - alert: ceph_cluster_usage_high + expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph cluster capacity usage more than 80 percent' + summary: 'ceph cluster usage is more than 80 percent' + - alert: ceph_placement_group_degrade_pct_high + expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph placement group degradation is more than 80 percent' + summary: 'ceph placement groups degraded' + - alert: ceph_osd_down_pct_high + expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph OSDs down percent is more than 80 percent' + summary: 'ceph OSDs down percent is high' + - alert: ceph_monitor_clock_skew_high + expr: ceph_monitor_clock_skew_seconds > 2 + for: 5m + labels: + severity: page + annotations: + description: 'ceph monitors clock skew on {{$labels.instance}} is more than 2 seconds' + summary: 'ceph monitor clock skew high' + fluentd: + groups: + - name: fluentd.rules + rules: + - alert: fluentd_not_running + expr: fluentd_up == 0 + for: 5m + labels: + severity: page + annotations: + description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes' + summary: 'Fluentd is down' + calico: + groups: + - name: calico.rules + rules: + - alert: calico_datapane_failures_high_1h + expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures 
within the last hour' + summary: 'A high number of dataplane failures within Felix are happening' + - alert: calico_datapane_address_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size' + summary: 'Felix address message batch size is higher' + - alert: calico_datapane_iface_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size' + summary: 'Felix interface message batch size is higher' + - alert: calico_ipset_errors_high_1h + expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour' + summary: 'A high number of ipset errors within Felix are happening' + - alert: calico_iptable_save_errors_high_1h + expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour' + summary: 'A high number of iptable save errors within Felix are happening' + - alert: calico_iptable_restore_errors_high_1h + expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5 + labels: + 
severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour' + summary: 'A high number of iptable restore errors within Felix are happening' + rabbitmq: + groups: + - name: rabbitmq.rules + rules: + - alert: rabbitmq_network_partitions_detected + expr: min(partitions) by(instance) > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions' + summary: 'RabbitMQ Network partitions detected' + - alert: rabbitmq_down + expr: min(rabbitmq_up) by(instance) != 1 + for: 10m + labels: + severity: page + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} is down' + summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins' + - alert: rabbitmq_file_descriptor_usage_high + expr: fd_used * 100 /fd_total > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.' + summary: 'RabbitMQ file descriptors usage is high for last 10 mins' + - alert: rabbitmq_node_disk_free_alarm + expr: node_disk_free_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.' + summary: 'RabbitMQ disk space usage is high' + - alert: rabbitmq_node_memory_alarm + expr: node_mem_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.' + summary: 'RabbitMQ memory usage is high' + - alert: rabbitmq_less_than_3_nodes + expr: running < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server has less than 3 nodes running.' 
+ summary: 'RabbitMQ server is at risk of losing data' + - alert: rabbitmq_queue_messages_returned_high + expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server is returning more than 50 percent of messages received.' + summary: 'RabbitMQ server is returning more than 50 percent of messages received.' + - alert: rabbitmq_consumers_low_utilization + expr: queue_consumer_utilisation < .4 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ consumers message consumption speed is low' + summary: 'RabbitMQ consumers message consumption speed is low' + - alert: rabbitmq_high_message_load + expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.' + summary: 'RabbitMQ has high message load' + elasticsearch: + groups: + - name: elasticsearch.rules + rules: + - alert: es_high_process_open_files_count + expr: sum(elasticsearch_process_open_files_count) by (host) > 64000 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.' + summary: 'Elasticsearch has a very high process open file count.' + - alert: es_high_process_cpu_percent + expr: elasticsearch_process_cpu_percent > 95 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.' + summary: 'Elasticsearch process cpu usage is more than 95 percent.' 
+ - alert: es_fs_usage_high + expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.' + summary: 'Elasticsearch filesystem usage is high.' + - alert: es_unassigned_shards + expr: elasticsearch_cluster_health_unassigned_shards > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch has {{ $value }} unassigned shards.' + summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.' + - alert: es_cluster_health_timed_out + expr: elasticsearch_cluster_health_timed_out > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status call timed out {{ $value }} times.' + summary: 'Elasticsearch cluster health status calls are timing out.' + - alert: es_cluster_health_status_alert + expr: elasticsearch_cluster_health_status > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.' + summary: 'Elasticsearch cluster health status is not green.' 
+ - alert: es_cluster_health_too_few_nodes_running + expr: elasticsearch_cluster_health_number_of_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch nodes running' + summary: 'ElasticSearch running on less than 3 nodes' + - alert: es_cluster_health_too_few_data_nodes_running + expr: elasticsearch_cluster_health_number_of_data_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' + summary: 'ElasticSearch running on less than 3 data nodes' + mariadb: + groups: + - name: mariadb.rules + rules: + - alert: mariadb_table_lock_wait_high + expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30 + for: 10m + labels: + severity: warning + annotations: + description: 'Mariadb has high table lock waits of {{ $value }} percentage' + summary: 'Mariadb table lock waits are high' + - alert: mariadb_node_not_ready + expr: mysql_global_status_wsrep_ready != 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not ready.' 
+ summary: 'Galera cluster node not ready' + - alert: mariadb_galera_node_out_of_sync + expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)' + summary: 'Galera cluster node out of sync' + - alert: mariadb_innodb_replication_fallen_behind + expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0) + for: 10m + labels: + severity: warning + annotations: + description: 'The mysql innodb replication has fallen behind and is not recovering' + summary: 'MySQL innodb replication is lagging' dependencies: - osh-infra-helm-toolkit ... diff --git a/global/software/charts/osh/openstack-mariadb/mariadb.yaml b/global/software/charts/osh/openstack-mariadb/mariadb.yaml index cae3239a4..d93ef70a0 100644 --- a/global/software/charts/osh/openstack-mariadb/mariadb.yaml +++ b/global/software/charts/osh/openstack-mariadb/mariadb.yaml @@ -31,6 +31,13 @@ metadata: path: .osh.oslo_db dest: path: .values.endpoints.olso_db + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_endpoints + path: .osh.prometheus_mysql_exporter + dest: + path: .values.endpoints.prometheus_mysql_exporter + # Accounts - src: schema: pegleg/AccountCatalogue/v1 @@ -38,6 +45,12 @@ metadata: path: .osh.oslo_db.admin dest: path: .values.endpoints.oslo_db.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_service_accounts + path: .osh.prometheus_mysql_exporter.user + dest: + path: .values.endpoints.prometheus_mysql_exporter.auth.user # Secrets - dest: @@ -46,6 +59,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_oslo_db_admin_password path: . + - dest: + path: .values.endpoints.oslo_db.auth.exporter.password + src: + schema: deckhand/Passphrase/v1 + name: osh_oslo_db_exporter_password + path: . 
data: chart_name: openstack-mariadb @@ -72,6 +91,9 @@ data: prometheus_mysql_exporter: node_selector_key: openstack-control-plane node_selector_value: enabled + monitoring: + prometheus: + enabled: true dependencies: - osh-helm-toolkit ... diff --git a/global/software/charts/ucp/ceph/ceph-mon.yaml b/global/software/charts/ucp/ceph/ceph-mon.yaml index dba3128ba..8cafe3549 100644 --- a/global/software/charts/ucp/ceph/ceph-mon.yaml +++ b/global/software/charts/ucp/ceph/ceph-mon.yaml @@ -130,8 +130,6 @@ data: namespace: ceph ceph_mon: namespace: ceph - fluentd: - namespace: osh-infra deployment: ceph: true storage_secrets: true diff --git a/global/software/charts/ucp/ceph/ceph-osd.yaml b/global/software/charts/ucp/ceph/ceph-osd.yaml index 618d230f8..aea2672e1 100644 --- a/global/software/charts/ucp/ceph/ceph-osd.yaml +++ b/global/software/charts/ucp/ceph/ceph-osd.yaml @@ -122,8 +122,6 @@ data: namespace: ceph ceph_mon: namespace: ceph - fluentd: - namespace: osh-infra bootstrap: enabled: true conf: diff --git a/global/software/charts/ucp/ucp-openstack-exporter/chart-group.yaml b/global/software/charts/ucp/ucp-openstack-exporter/chart-group.yaml new file mode 100644 index 000000000..3d9407ea1 --- /dev/null +++ b/global/software/charts/ucp/ucp-openstack-exporter/chart-group.yaml @@ -0,0 +1,13 @@ +--- +schema: armada/ChartGroup/v1 +metadata: + schema: metadata/Document/v1 + name: ucp-prometheus-openstack-exporter + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext +data: + description: Prometheus OpenStack Exporter for UCP Components + chart_group: + - ucp-prometheus-openstack-exporter diff --git a/global/software/charts/ucp/ucp-openstack-exporter/prometheus-openstack-exporter.yaml b/global/software/charts/ucp/ucp-openstack-exporter/prometheus-openstack-exporter.yaml new file mode 100644 index 000000000..10c8bdd58 --- /dev/null +++ b/global/software/charts/ucp/ucp-openstack-exporter/prometheus-openstack-exporter.yaml @@ -0,0 +1,95 @@ +--- 
+schema: armada/Chart/v1 +metadata: + schema: metadata/Document/v1 + name: ucp-prometheus-openstack-exporter + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext + substitutions: + # Chart source + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .charts.osh_infra.prometheus_openstack_exporter + dest: + path: .source + + # Images + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .images.osh_infra.prometheus_openstack_exporter + dest: + path: .values.images.tags + + # Endpoints + - src: + schema: pegleg/EndpointCatalogue/v1 + name: ucp_endpoints + path: .ucp.prometheus_openstack_exporter + dest: + path: .values.endpoints.prometheus_openstack_exporter + - src: + schema: pegleg/EndpointCatalogue/v1 + name: ucp_endpoints + path: .ucp.identity + dest: + path: .values.endpoints.identity + + # Accounts + - src: + schema: pegleg/AccountCatalogue/v1 + name: ucp_service_accounts + path: .ucp.keystone.admin + dest: + path: .values.endpoints.identity.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: ucp_service_accounts + path: .ucp.prometheus_openstack_exporter.user + dest: + path: .values.endpoints.identity.auth.user + + # Secrets + - dest: + path: .values.endpoints.identity.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: ucp_keystone_admin_password + path: . + - dest: + path: .values.endpoints.identity.auth.user.password + src: + schema: deckhand/Passphrase/v1 + name: ucp_openstack_exporter_keystone_password + path: . 
+data: + chart_name: ucp-prometheus-openstack-exporter + release: ucp-prometheus-openstack-exporter + namespace: ucp + wait: + timeout: 900 + labels: + release_group: airship-ucp-prometheus-openstack-exporter + install: + no_hooks: false + upgrade: + no_hooks: false + pre: + delete: + - type: job + labels: + release_group: airship-ucp-prometheus-openstack-exporter + values: + labels: + openstack_exporter: + node_selector_key: openstack-control-plane + node_selector_value: enabled + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + dependencies: + - osh-infra-helm-toolkit +... diff --git a/global/software/config/versions.yaml b/global/software/config/versions.yaml index 31589dac6..661e9f2b7 100644 --- a/global/software/config/versions.yaml +++ b/global/software/config/versions.yaml @@ -230,6 +230,11 @@ data: reference: 6ef48d37060e81cc6ffc283644bcd2df6c7ef80e subpath: prometheus-node-exporter type: git + prometheus_process_exporter: + location: https://git.openstack.org/openstack/openstack-helm-infra + reference: 922d7d3d26b15d1d25e40ba1528337002ae15b0c + subpath: prometheus-process-exporter + type: git prometheus_openstack_exporter: location: https://git.openstack.org/openstack/openstack-helm-infra reference: 6ef48d37060e81cc6ffc283644bcd2df6c7ef80e @@ -528,6 +533,7 @@ data: prometheus_kube_state_metrics: {} prometheus_node_exporter: {} prometheus_openstack_exporter: {} + prometheus_process_exporter: {} ucp: armada: api: quay.io/airshipit/armada:32baf9851e9cd29f4b8670e73a9409a0e7a59e35 diff --git a/global/software/manifests/full-site.yaml b/global/software/manifests/full-site.yaml index cd408a9ee..df9a8a5ca 100644 --- a/global/software/manifests/full-site.yaml +++ b/global/software/manifests/full-site.yaml @@ -29,6 +29,7 @@ data: - ucp-drydock - ucp-promenade - ucp-shipyard + - ucp-prometheus-openstack-exporter - osh-infra-ingress-controller - osh-infra-ceph-config - osh-infra-radosgw diff --git 
a/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml new file mode 100644 index 000000000..f134f46a9 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_oslo_db_exporter_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml new file mode 100644 index 000000000..b3df5f659 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_prometheus_admin_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml index 7fc1eddf1..9f64719a0 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: admin_access_key ... 
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml index 32f7d80f5..3e06f913a 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: admin_secret_key ... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml index befc16e1f..97c7d2312 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: elastic_access_key ... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml index 6dff56e51..60f0134e0 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: elastic_secret_key ... 
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml new file mode 100644 index 000000000..61b4144ad --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_oslo_db_exporter_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/ucp_openstack_exporter_keystone_password.yaml b/site/airship-seaworthy/secrets/passphrases/ucp_openstack_exporter_keystone_password.yaml new file mode 100644 index 000000000..b513af431 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/ucp_openstack_exporter_keystone_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: ucp_openstack_exporter_keystone_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... 
diff --git a/site/airship-seaworthy/software/config/endpoints.yaml b/site/airship-seaworthy/software/config/endpoints.yaml index 456dbd02c..c94e72259 100644 --- a/site/airship-seaworthy/software/config/endpoints.yaml +++ b/site/airship-seaworthy/software/config/endpoints.yaml @@ -260,6 +260,19 @@ data: default: "http" host_fqdn_override: default: null + prometheus_openstack_exporter: + namespace: ucp + hosts: + default: openstack-metrics + host_fqdn_override: + default: null + path: + default: null + scheme: + default: "http" + port: + exporter: + default: 9103 ceph: object_store: name: swift @@ -499,6 +512,19 @@ data: default: 3306 wsrep: default: 4567 + prometheus_mysql_exporter: + namespace: openstack + hosts: + default: mysql-exporter + host_fqdn_override: + default: null + path: + default: /metrics + scheme: + default: 'http' + port: + metrics: + default: 9104 keystone_oslo_messaging: namespace: openstack hosts: @@ -1098,6 +1124,19 @@ data: port: mysql: default: 3306 + prometheus_mysql_exporter: + namespace: osh-infra + hosts: + default: mysql-exporter + host_fqdn_override: + default: null + path: + default: /metrics + scheme: + default: 'http' + port: + metrics: + default: 9104 grafana: name: grafana namespace: osh-infra @@ -1132,7 +1171,8 @@ data: port: api: default: 9090 - public: 80 + http: + default: 80 kibana: name: kibana namespace: osh-infra @@ -1209,6 +1249,19 @@ data: default: 9100 prometheus_port: default: 9100 + process_exporter_metrics: + namespace: kube-system + hosts: + default: process-exporter + host_fqdn_override: + default: null + path: + default: null + scheme: + default: "http" + port: + metrics: + default: 9256 prometheus_openstack_exporter: namespace: openstack hosts: diff --git a/site/airship-seaworthy/software/config/service_accounts.yaml b/site/airship-seaworthy/software/config/service_accounts.yaml index a993dee13..9e6ffe99e 100644 --- a/site/airship-seaworthy/software/config/service_accounts.yaml +++ 
b/site/airship-seaworthy/software/config/service_accounts.yaml @@ -122,6 +122,14 @@ data: postgres: username: deckhand database: deckhand + prometheus_openstack_exporter: + user: + region_name: RegionOne + role: admin + username: prometheus-openstack-exporter + project_name: service + user_domain_name: default + project_domain_name: default ceph: swift: keystone: @@ -303,6 +311,9 @@ data: oslo_db: admin: username: root + prometheus_mysql_exporter: + user: + username: osh-oslodb-exporter neutron: neutron: role: admin @@ -406,6 +417,9 @@ data: oslo_db: admin: username: root + prometheus_mysql_exporter: + user: + username: osh-infra-oslodb-exporter prometheus_openstack_exporter: user: role: admin @@ -416,6 +430,9 @@ data: nagios: admin: username: nagios + prometheus: + admin: + username: prometheus ldap: admin: # NEWSITE-CHANGEME: Replace with the site's LDAP account used to