From 7705bba7e239e6881207f8dfb90b17b3d0334563 Mon Sep 17 00:00:00 2001 From: Steve Wilkerson Date: Mon, 17 Sep 2018 08:13:25 -0500 Subject: [PATCH] OSH-Infra: Update monitoring chart configs This updates chart configuration overrides for the monitoring services, as well as adds missing secrets, charts and service accounts for other exporters that have now been enabled Change-Id: Ia1ed7bba38d7c262e85de8162d53012cdadf487e --- .../osh-infra-dashboards/grafana.yaml | 16 + .../osh-infra/osh-infra-mariadb/mariadb.yaml | 23 +- .../osh-infra-monitoring/chart-group.yaml | 1 + .../osh-infra-monitoring/nagios.yaml | 30 + .../prometheus-process-exporter.yaml | 65 + .../osh-infra-monitoring/prometheus.yaml | 1572 +++++++++++++++++ .../charts/osh/openstack-mariadb/mariadb.yaml | 22 + global/software/charts/ucp/ceph/ceph-mon.yaml | 2 - global/software/charts/ucp/ceph/ceph-osd.yaml | 2 - .../ucp-openstack-exporter/chart-group.yaml | 13 + .../prometheus-openstack-exporter.yaml | 95 + global/software/config/versions.yaml | 6 + global/software/manifests/full-site.yaml | 1 + .../osh_infra_oslo_db_exporter_password.yaml | 11 + .../osh_infra_prometheus_admin_password.yaml | 11 + .../osh_infra_rgw_s3_admin_access_key.yaml | 2 +- .../osh_infra_rgw_s3_admin_secret_key.yaml | 2 +- ...infra_rgw_s3_elasticsearch_access_key.yaml | 2 +- ...infra_rgw_s3_elasticsearch_secret_key.yaml | 2 +- .../osh_oslo_db_exporter_password.yaml | 11 + ..._openstack_exporter_keystone_password.yaml | 11 + .../software/config/endpoints.yaml | 55 +- .../software/config/service_accounts.yaml | 17 + 23 files changed, 1962 insertions(+), 10 deletions(-) create mode 100644 global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml create mode 100644 global/software/charts/ucp/ucp-openstack-exporter/chart-group.yaml create mode 100644 global/software/charts/ucp/ucp-openstack-exporter/prometheus-openstack-exporter.yaml create mode 100644 site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml create mode 100644 site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml create mode 100644 site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml create mode 100644 site/airship-seaworthy/secrets/passphrases/ucp_openstack_exporter_keystone_password.yaml diff --git a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml index 48e36e23a..17dd92d06 100644 --- a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml +++ b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml @@ -68,6 +68,12 @@ metadata: path: .osh_infra.grafana.oslo_db dest: path: .values.endpoints.oslo_db.auth.user + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.monitoring.auth.admin - src: schema: pegleg/AccountCatalogue/v1 name: osh_infra_service_accounts @@ -120,6 +126,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_oslo_db_admin_password path: . + - dest: + path: .values.endpoints.monitoring.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . 
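For reference, the substitution just added pulls from the new osh_infra_prometheus_admin_password Passphrase created under site/airship-seaworthy/secrets/passphrases/ in this change; that file's hunk is not shown in this excerpt. A Deckhand Passphrase document of this kind usually follows the minimal layout sketched below — the document name matches the substitution source above, while the layer, storagePolicy, and the placeholder value are assumptions rather than the real contents of that file.

---
schema: deckhand/Passphrase/v1
metadata:
  schema: metadata/Document/v1
  name: osh_infra_prometheus_admin_password
  layeringDefinition:
    abstract: false
    layer: site             # assumed: site-level secret
  storagePolicy: cleartext  # assumed: may be encrypted depending on the site
data: password123           # placeholder only
...

The grafana chart then receives this value at .values.endpoints.monitoring.auth.admin.password through the substitution above.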
# LDAP Configuration Details - src: @@ -211,6 +223,10 @@ data: node_selector_key: openstack-control-plane node_selector_value: enabled conf: + provisioning: + datasources: + monitoring: + url: http://prom-metrics.osh-infra.svc.cluster.local:80/ ldap: config: base_dns: diff --git a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml index 310cfec33..1d90c3ccd 100644 --- a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml +++ b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml @@ -31,6 +31,13 @@ metadata: path: .osh_infra.oslo_db dest: path: .values.endpoints.olso_db + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.prometheus_mysql_exporter + dest: + path: .values.endpoints.prometheus_mysql_exporter + # Accounts - src: schema: pegleg/AccountCatalogue/v1 @@ -38,6 +45,12 @@ metadata: path: .osh_infra.oslo_db.admin dest: path: .values.endpoints.oslo_db.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus_mysql_exporter.user + dest: + path: .values.endpoints.prometheus_mysql_exporter.auth.user # Secrets - dest: @@ -46,7 +59,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_oslo_db_admin_password path: . - + - dest: + path: .values.endpoints.oslo_db.auth.exporter.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_oslo_db_exporter_password + path: . data: chart_name: osh-infra-mariadb release: osh-infra-mariadb @@ -72,6 +90,9 @@ data: prometheus_mysql_exporter: node_selector_key: openstack-control-plane node_selector_value: enabled + monitoring: + prometheus: + enabled: true dependencies: - osh-helm-toolkit ... diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml index 4cb879cd4..be06ca8ad 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml @@ -13,5 +13,6 @@ data: - prometheus - prometheus-alertmanager - prometheus-node-exporter + - prometheus-process-exporter - prometheus-kube-state-metrics - nagios diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml index 35ff41b3e..ee7122e8e 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml @@ -37,6 +37,12 @@ metadata: path: .osh_infra.monitoring dest: path: .values.endpoints.monitoring + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.elasticsearch + dest: + path: .values.endpoints.elasticsearch - src: schema: pegleg/EndpointCatalogue/v1 name: osh_infra_endpoints @@ -51,6 +57,18 @@ metadata: path: .osh_infra.nagios.admin dest: path: .values.endpoints.nagios.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.monitoring.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.elasticsearch.admin + dest: + path: .values.endpoints.elasticsearch.auth.admin # Secrets - dest: @@ -59,6 +77,18 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_nagios_admin_password path: . 
+ - dest: + path: .values.endpoints.elasticsearch.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_elasticsearch_admin_password + path: . + - dest: + path: .values.endpoints.monitoring.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . # LDAP Details - src: diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml new file mode 100644 index 000000000..d64e8564e --- /dev/null +++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml @@ -0,0 +1,65 @@ +--- +schema: armada/Chart/v1 +metadata: + schema: metadata/Document/v1 + name: prometheus-process-exporter + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext + substitutions: + # Chart source + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .charts.osh_infra.prometheus_process_exporter + dest: + path: .source + + # Images + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .images.osh_infra.prometheus_process_exporter + dest: + path: .values.images.tags + + # Endpoints + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.process_exporter_metrics + dest: + path: .values.endpoints.process_exporter_metrics + +data: + chart_name: prometheus-process-exporter + release: prometheus-process-exporter + namespace: kube-system + wait: + timeout: 900 + labels: + release_group: airship-prometheus-process-exporter + install: + no_hooks: false + upgrade: + no_hooks: false + pre: + delete: + - type: job + labels: + release_group: airship-prometheus-process-exporter + create: [] + post: + create: [] + values: + labels: + node_exporter: + node_selector_key: node-exporter + node_selector_value: enabled + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + dependencies: + - osh-infra-helm-toolkit +... diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml index 9674e8897..bfde817a9 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml @@ -37,6 +37,42 @@ metadata: path: .osh_infra.alerts dest: path: .values.endpoints.alerts + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.ldap + dest: + path: .values.endpoints.ldap + + # Accounts + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.prometheus.auth.admin + + # Secrets + - dest: + path: .values.endpoints.prometheus.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . + + # LDAP Details + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.ldap.admin + dest: + path: .values.endpoints.ldap.auth.admin + - dest: + path: .values.endpoints.ldap.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_keystone_ldap_password + path: . 
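For reference, the new Prometheus admin account substitution above resolves against the osh_infra_service_accounts AccountCatalogue (updated in service_accounts.yaml per the diffstat; that hunk is not part of this excerpt). A rough sketch of the shape such an entry typically takes is below — the .osh_infra.prometheus.admin path comes from the substitution above, while the username value and the abridged metadata are assumptions:

---
schema: pegleg/AccountCatalogue/v1
metadata:
  schema: metadata/Document/v1
  name: osh_infra_service_accounts
  # layeringDefinition and storagePolicy omitted for brevity
data:
  osh_infra:
    prometheus:
      admin:
        username: admin  # assumed value; the real entry lives in service_accounts.yaml
...

At render time the account dictionary lands at .values.endpoints.prometheus.auth.admin and the Passphrase substitution overlays its .password key, which is why the prometheus-metrics scrape job defined below can rely on the endpoints section for its basic auth credentials.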
data: chart_name: prometheus @@ -72,9 +108,1545 @@ data: pod: replicas: prometheus: 3 + resources: + enabled: true + prometheus: + limits: + memory: "64Gi" + cpu: "4000m" + requests: + memory: "16Gi" + cpu: "2000m" storage: requests: storage: 500Gi + conf: + prometheus: + command_line_flags: + storage.tsdb.max_block_duration: 17h + scrape_configs: + global: + scrape_interval: 60s + evaluation_interval: 60s + scrape_configs: + # NOTE(srwilkers): The job definition for Prometheus should always be + # listed first, so we can inject the basic auth username and password + # via the endpoints section + - job_name: 'prometheus-metrics' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: "prom-metrics" + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: instance + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - job_name: kubelet + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + scrape_interval: 45s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + - source_labels: + - __meta_kubernetes_node_name + action: replace + target_label: kubernetes_io_hostname + # Scrape config for Kubelet cAdvisor. + # + # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics + # (those whose names begin with 'container_') have been removed from the + # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to + # retrieve those metrics. + # + # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor + # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" + # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with + # the --cadvisor-port=0 Kubelet flag). 
+ # + # This job is not necessary and should be removed in Kubernetes 1.6 and + # earlier versions, or it will cause the metrics to be scraped twice. + - job_name: 'kubernetes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'container_network_tcp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_tasks_state' + action: drop + - source_labels: + - __name__ + regex: 'container_network_udp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failures_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_load_average_10s' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_system_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_user_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_free' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_current' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_weighted_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_read_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_sector_reads_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_sector_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_write_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_bytes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_last_seen' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_cache' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failcnt' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_max_usage_bytes' + action: drop + - source_labels: + - __name__ + 
regex: 'container_memory_rss' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_swap' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_usage_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_packets_dropped_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_dropped_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_period' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_shares' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_reservation_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_swap_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_start_time_seconds' + action: drop + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'apiserver' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 45s + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + # insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. 
+ relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + action: keep + regex: default;kubernetes;https + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'rest_client_request_latency_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_response_sizes_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_step_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_count' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_sum' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_request_latencies_summary' + action: drop + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + - job_name: 'openstack-exporter' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: "openstack-metrics" + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: instance + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: drop + regex: '(openstack-metrics|prom-metrics)' + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared). + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - job_name: calico-etcd + kubernetes_sd_configs: + - role: service + scrape_interval: 20s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: keep + source_labels: + - __meta_kubernetes_service_name + regex: "calico-etcd" + - action: keep + source_labels: + - __meta_kubernetes_namespace + regex: kube-system + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - source_labels: + - __meta_kubernetes_service_label + target_label: job + regex: calico-etcd + replacement: ${1} + - target_label: endpoint + replacement: "calico-etcd" + alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_application] + regex: alertmanager + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: alerts-api + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: peer-mesh + action: drop + rules: + alertmanager: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() 
label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. + summary: Alertmanager configuration reload has failed + etcd3: + groups: + - name: etcd3.rules + rules: + - alert: etcd_InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: etcd_NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: etcd_HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: etcd_HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow + summary: slow gRPC requests + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high 
number of HTTP requests are failing + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: etcd_HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow + summary: slow HTTP requests + - alert: etcd_EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: etcd_HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: etcd_HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: etcd_HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + kube_apiserver: + groups: + - name: kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high + kube_controller_manager: + groups: + - name: kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager-discovery"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication controllers are not making progress. 
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + kubelet: + groups: + - name: kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_ready{condition="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 + summary: Kubelet is close to pod limit + kubernetes: + groups: + - name: kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: 
sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, 
sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - alert: kube_statefulset_replicas_unavailable + expr: kube_statefulset_status_replicas < kube_statefulset_replicas + for: 5m + labels: + severity: page + annotations: + description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired' + summary: '{{$labels.statefulset}}: has inssuficient replicas.' + - alert: kube_daemonsets_misscheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run' + summary: 'Daemonsets not scheduled correctly' + - alert: kube_daemonsets_not_scheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number' + summary: 'Less than desired number of daemonsets scheduled' + - alert: kube_deployment_replicas_unavailable + expr: kube_deployment_status_replicas_unavailable > 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable' + summary: '{{$labels.deployment}}: has inssuficient replicas.' + - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable + expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update' + summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.' 
+ - alert: kube_job_status_failed + expr: kube_job_status_failed > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Job {{$labels.exported_job}} is in failed status' + summary: '{{$labels.exported_job}} has failed status' + - alert: kube_pod_status_pending + expr: kube_pod_status_phase{phase="Pending"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status' + - alert: kube_pod_error_image_pull + expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: kube_pod_status_error_image_pull + expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: kube_replicaset_missing_replicas + expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes' + summary: 'Replicaset {{$labels.replicaset}} is missing replicas' + - alert: kube_pod_container_terminated + expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: volume_claim_capacity_high_utilization + expr: (kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 + for: 5m + labels: + severity: page + annotations: + description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity' + summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.' + basic_linux: + groups: + - name: basic_linux.rules + rules: + - alert: node_filesystem_full_80percent + expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} + * 0.2) / 1024 ^ 3 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + got less than 10% space left on its filesystem.' + summary: '{{$labels.alias}}: Filesystem is running out of space soon.' + - alert: node_filesystem_full_in_4h + expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + is running out of space of in approx. 4 hours' + summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.' 
+ - alert: node_filedescriptors_full_in_3h + expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum + for: 20m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running out of available file descriptors + in approx. 3 hours' + summary: '{{$labels.alias}} is running out of available file descriptors in + 3 hours.' + - alert: node_load1_90percent + expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running with > 90% total load for at least + 1h.' + summary: '{{$labels.alias}}: Running on high load.' + - alert: node_cpu_util_90percent + expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} has total CPU utilization over 90% for at least + 1h.' + summary: '{{$labels.alias}}: High CPU utilization.' + - alert: node_ram_using_90percent + expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal + * 0.1 + for: 30m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using at least 90% of its RAM for at least + 30 minutes now.' + summary: '{{$labels.alias}}: Using lots of RAM.' + - alert: node_swap_using_80percent + expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) + > node_memory_SwapTotal * 0.8 + for: 10m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using 80% of its swap space for at least + 10 minutes now.' + summary: '{{$labels.alias}}: Running out of swap soon.' + - alert: node_high_cpu_load + expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0 + for: 1m + labels: + severity: warning + annotations: + description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}' + summary: '{{$labels.alias}}: Running on high load: {{$value}}' + - alert: node_high_memory_load + expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85 + for: 1m + labels: + severity: warning + annotations: + description: Host memory usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server memory is almost full + - alert: node_high_storage_load + expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) + / node_filesystem_size{mountpoint="/"} * 100 > 85 + for: 30s + labels: + severity: warning + annotations: + description: Host storage usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server storage is almost full + - alert: node_high_swap + expr: (node_memory_SwapTotal - node_memory_SwapFree) < (node_memory_SwapTotal + * 0.4) + for: 1m + labels: + severity: warning + annotations: + description: Host system has a high swap usage of {{ humanize $value }}. Reported + by instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server has a high swap usage + - alert: node_high_network_drop_rcv + expr: node_network_receive_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusally high drop in network reception ({{ + humanize $value }}). 
Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high receive drop + - alert: node_high_network_drop_send + expr: node_network_transmit_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusally high drop in network transmission ({{ + humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high transmit drop + - alert: node_high_network_errs_rcv + expr: node_network_receive_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusally high error rate in network reception + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusual high reception errors + - alert: node_high_network_errs_send + expr: node_network_transmit_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusally high error rate in network transmission + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusual high transmission errors + - alert: node_network_conntrack_usage_80percent + expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8) + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit' + summary: '{{$labels.instance}}: available network conntrack entries are low.' + - alert: node_entropy_available_low + expr: node_entropy_available_bits < 300 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300' + summary: '{{$labels.instance}}: is low on entropy bits.' 
+ - alert: node_hwmon_high_cpu_temp + expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}' + summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}' + - alert: node_vmstat_paging_rate_high + expr: irate(node_vmstat_pgpgin[5m]) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}' + summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}' + - alert: node_xfs_block_allocation_high + expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}' + summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}' + - alert: node_network_bond_slaves_down + expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0 + for: 5m + labels: + severity: page + annotations: + description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).' + summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)' + - alert: node_numa_memory_used + expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}' + summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}' + - alert: node_ntp_clock_skew_high + expr: abs(node_ntp_drift_seconds) > 2 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}' + summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds' + - alert: node_disk_read_latency + expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high read latency of {{ $value }}' + summary: 'High read latency observed for device {{ $labels.device }}' + - alert: node_disk_write_latency + expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high write latency of {{ $value }}' + summary: 'High write latency observed for device {{ $labels.device }}' + openstack: + groups: + - name: openstack.rules + rules: + - alert: os_glance_api_availability + expr: check_glance_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Glance API is not available at {{$labels.url}}' + - alert: os_nova_api_availability + expr: check_nova_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Nova API is not available at {{$labels.url}}' + - alert: 
os_keystone_api_availability + expr: check_keystone_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Keystone API is not available at {{$labels.url}}' + - alert: os_neutron_api_availability + expr: check_neutron_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Neutron API is not available at {{$labels.url}}' + - alert: os_swift_api_availability + expr: check_swift_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Swift API is not available at {{$labels.url}}' + - alert: os_nova_compute_disabled + expr: services_nova_compute_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-compute is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-compute is disabled on some hosts' + - alert: os_nova_conductor_disabled + expr: services_nova_conductor_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-conductor is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-conductor is disabled on some hosts' + - alert: os_nova_consoleauth_disabled + expr: services_nova_consoleauth_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-consoleauth is disabled on some hosts' + - alert: os_nova_scheduler_disabled + expr: services_nova_scheduler_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-scheduler is disabled on some hosts' + ceph: + groups: + - name: ceph.rules + rules: + - alert: ceph_monitor_quorum_low + expr: ceph_monitor_quorum_count < 3 + for: 5m + labels: + severity: page + annotations: + description: 'ceph monitor quorum has been less than 3 for more than 5 minutes' + summary: 'ceph high availability is at risk' + - alert: ceph_cluster_usage_high + expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph cluster capacity usage more than 80 percent' + summary: 'ceph cluster usage is more than 80 percent' + - alert: ceph_placement_group_degrade_pct_high + expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph placement group degradation is more than 80 percent' + summary: 'ceph placement groups degraded' + - alert: ceph_osd_down_pct_high + expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph OSDs down percent is more than 80 percent' + summary: 'ceph OSDs down percent is high' + - alert: ceph_monitor_clock_skew_high + expr: ceph_monitor_clock_skew_seconds > 2 + for: 5m + labels: + severity: page + annotations: + description: 'ceph monitors clock skew on {{$labels.instance}} is more than 2 seconds' + summary: 'ceph monitor clock skew high' + fluentd: + groups: + - name: fluentd.rules + rules: + - alert: fluentd_not_running + expr: fluentd_up == 0 + for: 5m + labels: + severity: page + 
annotations: + description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes' + summary: 'Fluentd is down' + calico: + groups: + - name: calico.rules + rules: + - alert: calico_datapane_failures_high_1h + expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour' + summary: 'A high number of dataplane failures within Felix are happening' + - alert: calico_datapane_address_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size' + summary: 'Felix address message batch size is high' + - alert: calico_datapane_iface_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size' + summary: 'Felix interface message batch size is high' + - alert: calico_ipset_errors_high_1h + expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour' + summary: 'A high number of ipset errors within Felix are happening' + - alert: calico_iptable_save_errors_high_1h + expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour' + summary: 'A high number of iptable save errors within Felix are happening' + - alert: calico_iptable_restore_errors_high_1h + expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour' + summary: 'A high number of iptable restore errors within Felix are happening' + rabbitmq: + groups: + - name: rabbitmq.rules + rules: + - alert: rabbitmq_network_pratitions_detected + expr: min(partitions) by(instance) > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions' + summary: 'RabbitMQ Network partitions detected' + - alert: rabbitmq_down + expr: min(rabbitmq_up) by(instance) != 1 + for: 10m + labels: + severity: page + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} is down' + summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down for the last 10 mins' + - alert: rabbitmq_file_descriptor_usage_high + expr: fd_used * 100 /fd_total > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }}
percent.' + summary: 'RabbitMQ file descriptor usage is high for the last 10 mins' + - alert: rabbitmq_node_disk_free_alarm + expr: node_disk_free_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.' + summary: 'RabbitMQ disk space usage is high' + - alert: rabbitmq_node_memory_alarm + expr: node_mem_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.' + summary: 'RabbitMQ memory usage is high' + - alert: rabbitmq_less_than_3_nodes + expr: running < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server has less than 3 nodes running.' + summary: 'RabbitMQ server is at risk of losing data' + - alert: rabbitmq_queue_messages_returned_high + expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server is returning more than 50 percent of messages received.' + summary: 'RabbitMQ server is returning more than 50 percent of messages received.' + - alert: rabbitmq_consumers_low_utilization + expr: queue_consumer_utilisation < .4 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ consumers message consumption speed is low' + summary: 'RabbitMQ consumers message consumption speed is low' + - alert: rabbitmq_high_message_load + expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ has high message load. Total queue depth > 17000 or growth of more than 4000 messages.' + summary: 'RabbitMQ has high message load' + elasticsearch: + groups: + - name: elasticsearch.rules + rules: + - alert: es_high_process_open_files_count + expr: sum(elasticsearch_process_open_files_count) by (host) > 64000 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.' + summary: 'Elasticsearch has a very high process open file count.' + - alert: es_high_process_cpu_percent + expr: elasticsearch_process_cpu_percent > 95 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.' + summary: 'Elasticsearch process cpu usage is more than 95 percent.' + - alert: es_fs_usage_high + expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.' + summary: 'Elasticsearch filesystem usage is high.' + - alert: es_unassigned_shards + expr: elasticsearch_cluster_health_unassigned_shards > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch has {{ $value }} unassigned shards.' + summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.' + - alert: es_cluster_health_timed_out + expr: elasticsearch_cluster_health_timed_out > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status call timed out {{ $value }} times.' + summary: 'Elasticsearch cluster health status calls are timing out.'
+ - alert: es_cluster_health_status_alert + expr: elasticsearch_cluster_health_status > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.' + summary: 'Elasticsearch cluster health status is not green.' + - alert: es_cluster_health_too_few_nodes_running + expr: elasticsearch_cluster_health_number_of_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch nodes running' + summary: 'ElasticSearch running on less than 3 nodes' + - alert: es_cluster_health_too_few_data_nodes_running + expr: elasticsearch_cluster_health_number_of_data_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' + summary: 'ElasticSearch running on less than 3 data nodes' + mariadb: + groups: + - name: mariadb.rules + rules: + - alert: mariadb_table_lock_wait_high + expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30 + for: 10m + labels: + severity: warning + annotations: + description: 'Mariadb has high table lock waits of {{ $value }} percentage' + summary: 'Mariadb table lock waits are high' + - alert: mariadb_node_not_ready + expr: mysql_global_status_wsrep_ready != 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not ready.' + summary: 'Galera cluster node not ready' + - alert: mariadb_galera_node_out_of_sync + expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)' + summary: 'Galera cluster node out of sync' + - alert: mariadb_innodb_replication_fallen_behind + expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0) + for: 10m + labels: + severity: warning + annotations: + description: 'The mysql innodb replication has fallen behind and is not recovering' + summary: 'MySQL innodb replication is lagging' dependencies: - osh-infra-helm-toolkit ... diff --git a/global/software/charts/osh/openstack-mariadb/mariadb.yaml b/global/software/charts/osh/openstack-mariadb/mariadb.yaml index cae3239a4..d93ef70a0 100644 --- a/global/software/charts/osh/openstack-mariadb/mariadb.yaml +++ b/global/software/charts/osh/openstack-mariadb/mariadb.yaml @@ -31,6 +31,13 @@ metadata: path: .osh.oslo_db dest: path: .values.endpoints.olso_db + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_endpoints + path: .osh.prometheus_mysql_exporter + dest: + path: .values.endpoints.prometheus_mysql_exporter + # Accounts - src: schema: pegleg/AccountCatalogue/v1 @@ -38,6 +45,12 @@ metadata: path: .osh.oslo_db.admin dest: path: .values.endpoints.oslo_db.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_service_accounts + path: .osh.prometheus_mysql_exporter.user + dest: + path: .values.endpoints.prometheus_mysql_exporter.auth.user # Secrets - dest: @@ -46,6 +59,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_oslo_db_admin_password path: . + - dest: + path: .values.endpoints.oslo_db.auth.exporter.password + src: + schema: deckhand/Passphrase/v1 + name: osh_oslo_db_exporter_password + path: . 
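The mariadb.rules group above keys off metrics served by the mysqld exporter that this change enables, and whose exporter credentials are substituted into the mariadb chart here. As a quick post-deployment sanity check, one of those expressions can be evaluated by hand against the Prometheus HTTP API. The sketch below is illustrative only: it assumes the Prometheus API has been made reachable on localhost:9090 (for example via a kubectl port-forward of the site's Prometheus service), which is not part of this change.

    # Illustrative sketch: evaluate one of the mariadb.rules expressions above
    # against a running Prometheus. Assumes the API is reachable on
    # localhost:9090, e.g. via a port-forward of the Prometheus pod/service.
    import json
    import urllib.parse
    import urllib.request

    PROMETHEUS = "http://localhost:9090"           # assumption: local port-forward
    EXPR = "mysql_global_status_wsrep_ready != 1"  # condition used by mariadb_node_not_ready

    url = f"{PROMETHEUS}/api/v1/query?" + urllib.parse.urlencode({"query": EXPR})
    with urllib.request.urlopen(url) as resp:
        result = json.load(resp)["data"]["result"]

    # An empty result vector means no Galera node currently matches the alert condition.
    for sample in result:
        print(sample["metric"].get("instance"), sample["value"])

An empty result is the healthy case; any series returned here is one that would, after the 10m hold period, fire mariadb_node_not_ready.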
data: chart_name: openstack-mariadb @@ -72,6 +91,9 @@ data: prometheus_mysql_exporter: node_selector_key: openstack-control-plane node_selector_value: enabled + monitoring: + prometheus: + enabled: true dependencies: - osh-helm-toolkit ... diff --git a/global/software/charts/ucp/ceph/ceph-mon.yaml b/global/software/charts/ucp/ceph/ceph-mon.yaml index dba3128ba..8cafe3549 100644 --- a/global/software/charts/ucp/ceph/ceph-mon.yaml +++ b/global/software/charts/ucp/ceph/ceph-mon.yaml @@ -130,8 +130,6 @@ data: namespace: ceph ceph_mon: namespace: ceph - fluentd: - namespace: osh-infra deployment: ceph: true storage_secrets: true diff --git a/global/software/charts/ucp/ceph/ceph-osd.yaml b/global/software/charts/ucp/ceph/ceph-osd.yaml index 618d230f8..aea2672e1 100644 --- a/global/software/charts/ucp/ceph/ceph-osd.yaml +++ b/global/software/charts/ucp/ceph/ceph-osd.yaml @@ -122,8 +122,6 @@ data: namespace: ceph ceph_mon: namespace: ceph - fluentd: - namespace: osh-infra bootstrap: enabled: true conf: diff --git a/global/software/charts/ucp/ucp-openstack-exporter/chart-group.yaml b/global/software/charts/ucp/ucp-openstack-exporter/chart-group.yaml new file mode 100644 index 000000000..3d9407ea1 --- /dev/null +++ b/global/software/charts/ucp/ucp-openstack-exporter/chart-group.yaml @@ -0,0 +1,13 @@ +--- +schema: armada/ChartGroup/v1 +metadata: + schema: metadata/Document/v1 + name: ucp-prometheus-openstack-exporter + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext +data: + description: Prometheus OpenStack Exporter for UCP Components + chart_group: + - ucp-prometheus-openstack-exporter diff --git a/global/software/charts/ucp/ucp-openstack-exporter/prometheus-openstack-exporter.yaml b/global/software/charts/ucp/ucp-openstack-exporter/prometheus-openstack-exporter.yaml new file mode 100644 index 000000000..10c8bdd58 --- /dev/null +++ b/global/software/charts/ucp/ucp-openstack-exporter/prometheus-openstack-exporter.yaml @@ -0,0 +1,95 @@ +--- +schema: armada/Chart/v1 +metadata: + schema: metadata/Document/v1 + name: ucp-prometheus-openstack-exporter + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext + substitutions: + # Chart source + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .charts.osh_infra.prometheus_openstack_exporter + dest: + path: .source + + # Images + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .images.osh_infra.prometheus_openstack_exporter + dest: + path: .values.images.tags + + # Endpoints + - src: + schema: pegleg/EndpointCatalogue/v1 + name: ucp_endpoints + path: .ucp.prometheus_openstack_exporter + dest: + path: .values.endpoints.prometheus_openstack_exporter + - src: + schema: pegleg/EndpointCatalogue/v1 + name: ucp_endpoints + path: .ucp.identity + dest: + path: .values.endpoints.identity + + # Accounts + - src: + schema: pegleg/AccountCatalogue/v1 + name: ucp_service_accounts + path: .ucp.keystone.admin + dest: + path: .values.endpoints.identity.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: ucp_service_accounts + path: .ucp.prometheus_openstack_exporter.user + dest: + path: .values.endpoints.identity.auth.user + + # Secrets + - dest: + path: .values.endpoints.identity.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: ucp_keystone_admin_password + path: . + - dest: + path: .values.endpoints.identity.auth.user.password + src: + schema: deckhand/Passphrase/v1 + name: ucp_openstack_exporter_keystone_password + path: . 
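The substitutions above give the UCP openstack-exporter both the Keystone admin credential and its own service user. If the exporter logs authentication failures after deployment, that service account can be exercised directly with keystoneauth1. This is only a sketch: the auth_url is an assumed placeholder, and the username, project, and domain values mirror the prometheus_openstack_exporter entry added to the service accounts catalogue in this change.

    # Illustrative sketch: confirm the exporter's Keystone service account can
    # obtain a token. Requires keystoneauth1; auth_url is a placeholder and
    # should come from the site's UCP identity endpoint.
    from keystoneauth1 import session
    from keystoneauth1.identity import v3

    auth = v3.Password(
        auth_url="http://keystone.ucp.svc.cluster.local:80/v3",   # assumption
        username="prometheus-openstack-exporter",                 # from the service accounts catalogue
        password="<ucp_openstack_exporter_keystone_password>",    # substituted secret
        project_name="service",
        user_domain_name="default",
        project_domain_name="default",
    )
    sess = session.Session(auth=auth)
    print("token issued:", bool(sess.get_token()))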
+data: + chart_name: ucp-prometheus-openstack-exporter + release: ucp-prometheus-openstack-exporter + namespace: ucp + wait: + timeout: 900 + labels: + release_group: airship-ucp-prometheus-openstack-exporter + install: + no_hooks: false + upgrade: + no_hooks: false + pre: + delete: + - type: job + labels: + release_group: airship-ucp-prometheus-openstack-exporter + values: + labels: + openstack_exporter: + node_selector_key: openstack-control-plane + node_selector_value: enabled + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + dependencies: + - osh-infra-helm-toolkit +... diff --git a/global/software/config/versions.yaml b/global/software/config/versions.yaml index 31589dac6..661e9f2b7 100644 --- a/global/software/config/versions.yaml +++ b/global/software/config/versions.yaml @@ -230,6 +230,11 @@ data: reference: 6ef48d37060e81cc6ffc283644bcd2df6c7ef80e subpath: prometheus-node-exporter type: git + prometheus_process_exporter: + location: https://git.openstack.org/openstack/openstack-helm-infra + reference: 922d7d3d26b15d1d25e40ba1528337002ae15b0c + subpath: prometheus-process-exporter + type: git prometheus_openstack_exporter: location: https://git.openstack.org/openstack/openstack-helm-infra reference: 6ef48d37060e81cc6ffc283644bcd2df6c7ef80e @@ -528,6 +533,7 @@ data: prometheus_kube_state_metrics: {} prometheus_node_exporter: {} prometheus_openstack_exporter: {} + prometheus_process_exporter: {} ucp: armada: api: quay.io/airshipit/armada:32baf9851e9cd29f4b8670e73a9409a0e7a59e35 diff --git a/global/software/manifests/full-site.yaml b/global/software/manifests/full-site.yaml index cd408a9ee..df9a8a5ca 100644 --- a/global/software/manifests/full-site.yaml +++ b/global/software/manifests/full-site.yaml @@ -29,6 +29,7 @@ data: - ucp-drydock - ucp-promenade - ucp-shipyard + - ucp-prometheus-openstack-exporter - osh-infra-ingress-controller - osh-infra-ceph-config - osh-infra-radosgw diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml new file mode 100644 index 000000000..f134f46a9 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_oslo_db_exporter_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml new file mode 100644 index 000000000..b3df5f659 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_prometheus_admin_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... 
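The password123 values in the new Passphrase documents above (and in the ones that follow) are placeholders for the seaworthy reference site. Sites derived from these manifests should replace them; a minimal sketch for producing stronger values is shown below, assuming the generated strings are pasted back into the corresponding documents or fed through the site's own secret-generation tooling.

    # Illustrative sketch: generate replacement values for the placeholder
    # passphrases introduced in this change.
    import secrets

    for name in (
        "osh_infra_oslo_db_exporter_password",
        "osh_infra_prometheus_admin_password",
        "osh_oslo_db_exporter_password",
        "ucp_openstack_exporter_keystone_password",
    ):
        print(f"{name}: {secrets.token_urlsafe(24)}")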
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml index 7fc1eddf1..9f64719a0 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: admin_access_key ... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml index 32f7d80f5..3e06f913a 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: admin_secret_key ... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml index befc16e1f..97c7d2312 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: elastic_access_key ... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml index 6dff56e51..60f0134e0 100644 --- a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml @@ -7,5 +7,5 @@ metadata: abstract: false layer: site storagePolicy: cleartext -data: password123 +data: elastic_secret_key ... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml new file mode 100644 index 000000000..61b4144ad --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_oslo_db_exporter_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/ucp_openstack_exporter_keystone_password.yaml b/site/airship-seaworthy/secrets/passphrases/ucp_openstack_exporter_keystone_password.yaml new file mode 100644 index 000000000..b513af431 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/ucp_openstack_exporter_keystone_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: ucp_openstack_exporter_keystone_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... 
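The RGW S3 documents above now carry named placeholder keys instead of password123. Once a site populates them with real credentials, they can be smoke-tested against RadosGW with boto3. The sketch below is purely illustrative: the endpoint URL is an assumption (use the site's ceph object store endpoint), and the key values shown are the seaworthy placeholders, not working credentials.

    # Illustrative sketch: verify a pair of RGW S3 credentials against RadosGW.
    # Requires boto3; endpoint URL and keys are placeholders, not values defined
    # by this change.
    import boto3

    s3 = boto3.client(
        "s3",
        endpoint_url="http://radosgw.osh-infra.svc.cluster.local:8088",  # assumption
        aws_access_key_id="elastic_access_key",                          # placeholder
        aws_secret_access_key="elastic_secret_key",                      # placeholder
    )
    print([b["Name"] for b in s3.list_buckets().get("Buckets", [])])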
diff --git a/site/airship-seaworthy/software/config/endpoints.yaml b/site/airship-seaworthy/software/config/endpoints.yaml index 456dbd02c..c94e72259 100644 --- a/site/airship-seaworthy/software/config/endpoints.yaml +++ b/site/airship-seaworthy/software/config/endpoints.yaml @@ -260,6 +260,19 @@ data: default: "http" host_fqdn_override: default: null + prometheus_openstack_exporter: + namespace: ucp + hosts: + default: openstack-metrics + host_fqdn_override: + default: null + path: + default: null + scheme: + default: "http" + port: + exporter: + default: 9103 ceph: object_store: name: swift @@ -499,6 +512,19 @@ data: default: 3306 wsrep: default: 4567 + prometheus_mysql_exporter: + namespace: openstack + hosts: + default: mysql-exporter + host_fqdn_override: + default: null + path: + default: /metrics + scheme: + default: 'http' + port: + metrics: + default: 9104 keystone_oslo_messaging: namespace: openstack hosts: @@ -1098,6 +1124,19 @@ data: port: mysql: default: 3306 + prometheus_mysql_exporter: + namespace: osh-infra + hosts: + default: mysql-exporter + host_fqdn_override: + default: null + path: + default: /metrics + scheme: + default: 'http' + port: + metrics: + default: 9104 grafana: name: grafana namespace: osh-infra @@ -1132,7 +1171,8 @@ data: port: api: default: 9090 - public: 80 + http: + default: 80 kibana: name: kibana namespace: osh-infra @@ -1209,6 +1249,19 @@ data: default: 9100 prometheus_port: default: 9100 + process_exporter_metrics: + namespace: kube-system + hosts: + default: process-exporter + host_fqdn_override: + default: null + path: + default: null + scheme: + default: "http" + port: + metrics: + default: 9256 prometheus_openstack_exporter: namespace: openstack hosts: diff --git a/site/airship-seaworthy/software/config/service_accounts.yaml b/site/airship-seaworthy/software/config/service_accounts.yaml index a993dee13..9e6ffe99e 100644 --- a/site/airship-seaworthy/software/config/service_accounts.yaml +++ b/site/airship-seaworthy/software/config/service_accounts.yaml @@ -122,6 +122,14 @@ data: postgres: username: deckhand database: deckhand + prometheus_openstack_exporter: + user: + region_name: RegionOne + role: admin + username: prometheus-openstack-exporter + project_name: service + user_domain_name: default + project_domain_name: default ceph: swift: keystone: @@ -303,6 +311,9 @@ data: oslo_db: admin: username: root + prometheus_mysql_exporter: + user: + username: osh-oslodb-exporter neutron: neutron: role: admin @@ -406,6 +417,9 @@ data: oslo_db: admin: username: root + prometheus_mysql_exporter: + user: + username: osh-infra-oslodb-exporter prometheus_openstack_exporter: user: role: admin @@ -416,6 +430,9 @@ data: nagios: admin: username: nagios + prometheus: + admin: + username: prometheus ldap: admin: # NEWSITE-CHANGEME: Replace with the site's LDAP account used to