diff --git a/charts/apiserver/templates/daemonset.yaml b/charts/apiserver/templates/daemonset.yaml index f0204d9e..01d0d40e 100644 --- a/charts/apiserver/templates/daemonset.yaml +++ b/charts/apiserver/templates/daemonset.yaml @@ -62,6 +62,17 @@ spec: exec: command: - /tmp/bin/pre_stop + + readinessProbe: + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_apiserver.port }} + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + volumeMounts: - name: {{ .Values.service.name }}-certs mountPath: /certs diff --git a/charts/apiserver/templates/etc/_kubernetes-apiserver.yaml.tpl b/charts/apiserver/templates/etc/_kubernetes-apiserver.yaml.tpl index d09b3799..5da319b6 100644 --- a/charts/apiserver/templates/etc/_kubernetes-apiserver.yaml.tpl +++ b/charts/apiserver/templates/etc/_kubernetes-apiserver.yaml.tpl @@ -33,6 +33,7 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP + command: {{- range .Values.command_prefix }} - {{ . }} @@ -58,6 +59,29 @@ spec: ports: - containerPort: {{ .Values.network.kubernetes_apiserver.port }} + + readinessProbe: + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_apiserver.port }} + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + + livenessProbe: + failureThreshold: 2 + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_apiserver.port }} + scheme: HTTPS + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 10 + volumeMounts: - name: etc mountPath: /etc/kubernetes/apiserver diff --git a/charts/controller_manager/templates/daemonset.yaml b/charts/controller_manager/templates/daemonset.yaml index e350ada7..c69e74ce 100644 --- a/charts/controller_manager/templates/daemonset.yaml +++ b/charts/controller_manager/templates/daemonset.yaml @@ -61,6 +61,16 @@ spec: exec: command: - /tmp/bin/pre_stop + + readinessProbe: + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_controller_manager.port }} + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + volumeMounts: - name: {{ .Values.service.name }}-bin mountPath: /tmp/bin diff --git a/charts/controller_manager/templates/etc/_kubernetes-controller-manager.yaml.tpl b/charts/controller_manager/templates/etc/_kubernetes-controller-manager.yaml.tpl index 4f6c3027..733f6f3b 100644 --- a/charts/controller_manager/templates/etc/_kubernetes-controller-manager.yaml.tpl +++ b/charts/controller_manager/templates/etc/_kubernetes-controller-manager.yaml.tpl @@ -39,6 +39,8 @@ spec: {{- range .Values.command_prefix }} - {{ . }} {{- end }} + - --address=127.0.0.1 + - --port={{ .Values.network.kubernetes_controller_manager.port }} - --configure-cloud-routes=false - --leader-elect=true - --kubeconfig=/etc/kubernetes/controller-manager/kubeconfig.yaml @@ -47,6 +49,26 @@ spec: - --use-service-account-credentials=true - --v=5 + readinessProbe: + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_controller_manager.port }} + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + + livenessProbe: + failureThreshold: 2 + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_controller_manager.port }} + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 10 + volumeMounts: - name: etc mountPath: /etc/kubernetes/controller-manager diff --git a/charts/controller_manager/values.yaml b/charts/controller_manager/values.yaml index fcd84afc..323a8283 100644 --- a/charts/controller_manager/values.yaml +++ b/charts/controller_manager/values.yaml @@ -68,6 +68,8 @@ network: kubernetes_netloc: 10.96.0.1 pod_cidr: 10.97.0.0/16 service_cidr: 10.96.0.0/16 + kubernetes_controller_manager: + port: 10252 service: name: kubernetes-controller-manager diff --git a/charts/etcd/templates/tests/test-etcd-health.yaml b/charts/etcd/templates/tests/test-etcd-health.yaml index 0c53ca2a..fcd88e2c 100644 --- a/charts/etcd/templates/tests/test-etcd-health.yaml +++ b/charts/etcd/templates/tests/test-etcd-health.yaml @@ -24,18 +24,12 @@ metadata: annotations: "helm.sh/hook": "test-success" spec: + nodeSelector: + {{ .Values.labels.anchor.node_selector_key }}: {{ .Values.labels.anchor.node_selector_value }} restartPolicy: Never containers: - name: "{{ .Release.Name }}-etcd-test" env: - - name: ETCD_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - name: ETCDCTL_API value: '3' - name: ETCDCTL_DIAL_TIMEOUT @@ -48,37 +42,19 @@ spec: value: /etc/etcd/tls/certs/anchor-etcd-client.pem - name: ETCDCTL_KEY value: /etc/etcd/tls/keys/anchor-etcd-client-key.pem - - name: CLIENT_ENDPOINT - value: https://$(POD_IP):{{ .Values.network.service_client.target_port }} - - name: PEER_ENDPOINT - value: https://$(POD_IP):{{ .Values.network.service_peer.target_port }} image: {{ .Values.images.tags.etcdctl }} imagePullPolicy: {{ .Values.images.pull_policy }} {{ tuple . .Values.pod.resources.test | include "helm-toolkit.snippets.kubernetes_resources" | indent 6 }} command: - - /bin/sh - - -c - - | - HEALTH=$(etcdctl endpoint health) - CODE=$? - echo $HEALTH - exit $CODE + - etcdctl + - endpoint + - health volumeMounts: - - name: {{ .Values.service.name }}-bin - mountPath: /tmp - name: {{ .Values.service.name }}-certs mountPath: /etc/etcd/tls/certs - - name: etcd-etc - mountPath: /etcd-etc - name: {{ .Values.service.name }}-keys mountPath: /etc/etcd/tls/keys - - name: {{ .Values.service.name }}-etc - mountPath: /anchor-etcd volumes: - - name: {{ .Values.service.name }}-bin - configMap: - name: {{ .Values.service.name }}-bin - defaultMode: 0555 - name: {{ .Values.service.name }}-certs configMap: name: {{ .Values.service.name }}-certs @@ -87,12 +63,5 @@ spec: secret: secretName: {{ .Values.service.name }}-keys defaultMode: 0444 - - name: etcd-etc - hostPath: - path: {{ .Values.etcd.host_etc_path }} - - name: {{ .Values.service.name }}-etc - configMap: - name: {{ .Values.service.name }}-etc - defaultMode: 0444 ... {{- end }} diff --git a/charts/scheduler/templates/etc/_kubernetes-scheduler.yaml.tpl b/charts/scheduler/templates/etc/_kubernetes-scheduler.yaml.tpl index bf7a960e..3467f222 100644 --- a/charts/scheduler/templates/etc/_kubernetes-scheduler.yaml.tpl +++ b/charts/scheduler/templates/etc/_kubernetes-scheduler.yaml.tpl @@ -39,9 +39,31 @@ spec: {{- range .Values.command_prefix }} - {{ . }} {{- end }} + - --address=127.0.0.1 + - --port={{ .Values.network.kubernetes_scheduler.port }} - --leader-elect=true - --kubeconfig=/etc/kubernetes/scheduler/kubeconfig.yaml + readinessProbe: + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_scheduler.port }} + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + + livenessProbe: + failureThreshold: 2 + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_scheduler.port }} + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 15 + volumeMounts: - name: etc mountPath: /etc/kubernetes/scheduler diff --git a/charts/scheduler/templates/sched-anchor.yaml b/charts/scheduler/templates/sched-anchor.yaml index e40080cb..20107290 100644 --- a/charts/scheduler/templates/sched-anchor.yaml +++ b/charts/scheduler/templates/sched-anchor.yaml @@ -53,6 +53,16 @@ spec: exec: command: - /tmp/bin/pre_stop + + readinessProbe: + httpGet: + host: 127.0.0.1 + path: /healthz + port: {{ .Values.network.kubernetes_scheduler.port }} + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 5 + volumeMounts: - name: bin mountPath: /tmp/bin diff --git a/charts/scheduler/values.yaml b/charts/scheduler/values.yaml index ad5053a0..c8e52079 100644 --- a/charts/scheduler/values.yaml +++ b/charts/scheduler/values.yaml @@ -69,6 +69,8 @@ images: network: kubernetes_netloc: 10.96.0.1 + kubernetes_scheduler: + port: 10251 service: name: kubernetes-scheduler diff --git a/examples/basic/armada-resources.yaml b/examples/basic/armada-resources.yaml index d275954f..acd625ae 100644 --- a/examples/basic/armada-resources.yaml +++ b/examples/basic/armada-resources.yaml @@ -68,6 +68,7 @@ metadata: storagePolicy: cleartext data: description: Kubernetes components + sequenced: true chart_group: - haproxy - kubernetes-etcd @@ -86,6 +87,7 @@ metadata: storagePolicy: cleartext data: description: UCP platform components + sequenced: true chart_group: - promenade --- @@ -585,6 +587,10 @@ data: kubernetes-apiserver: server_opts: "check port 6443" conf_parts: + global: + - timeout connect 5000ms + - timeout client 30s + - timeout server 30s frontend: - mode tcp - bind *:6553 @@ -1107,6 +1113,9 @@ data: no_hooks: false upgrade: no_hooks: false + timeout: 600 + wait: + timeout: 600 values: images: tags: diff --git a/examples/complete/armada-resources.yaml b/examples/complete/armada-resources.yaml index 3456df4f..eb8a64ff 100644 --- a/examples/complete/armada-resources.yaml +++ b/examples/complete/armada-resources.yaml @@ -373,6 +373,7 @@ data: timeout: 600 wait: timeout: 600 + test: true upgrade: no_hooks: true values: diff --git a/promenade/templates/include/genesis-etcd/common-volumes.yaml b/promenade/templates/include/genesis-etcd/common-volumes.yaml index a098fa66..9e8b9bac 100644 --- a/promenade/templates/include/genesis-etcd/common-volumes.yaml +++ b/promenade/templates/include/genesis-etcd/common-volumes.yaml @@ -1,6 +1,6 @@ - - name: data + - name: data-{{ etcd_name }} hostPath: - path: /var/lib/etcd/{{ etcd_name }} - - name: pki + path: /var/lib/etcd/{{ volume_name }} + - name: pki-{{ etcd_name }} hostPath: path: /etc/genesis/etcd/pki diff --git a/promenade/templates/include/genesis-etcd/server-container.yaml b/promenade/templates/include/genesis-etcd/server-container.yaml index f6ba4567..5744097c 100644 --- a/promenade/templates/include/genesis-etcd/server-container.yaml +++ b/promenade/templates/include/genesis-etcd/server-container.yaml @@ -1,4 +1,4 @@ - - name: etcd + - name: etcd-{{ etcd_name }} image: {{ config['Genesis:images.kubernetes.etcd'] }} env: - name: ETCD_NAME @@ -38,7 +38,7 @@ - name: ETCD_INITIAL_CLUSTER_STATE value: new - name: ETCD_INITIAL_CLUSTER - value: genesis=https://{{ config['Genesis:ip'] }}:2380,auxiliary=https://{{ config['Genesis:ip'] }}:12380 + value: {{ config['Genesis:hostname'] }}=https://{{ config['Genesis:ip'] }}:2380,auxiliary-0=https://{{ config['Genesis:ip'] }}:12380,auxiliary-1=https://{{ config['Genesis:ip'] }}:22380 - name: ETCDCTL_API value: '3' - name: ETCDCTL_DIAL_TIMEOUT @@ -57,7 +57,7 @@ - name: peer containerPort: {{ peer_port }} volumeMounts: - - name: data + - name: data-{{ etcd_name }} mountPath: /var/lib/etcd - - name: pki + - name: pki-{{ etcd_name }} mountPath: /etc/etcd/pki diff --git a/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml b/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml index 78d220c2..8d94b700 100644 --- a/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml +++ b/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml @@ -9,9 +9,15 @@ metadata: promenade: genesis spec: hostNetwork: true -{%- with etcd_name = 'auxiliary', client_port = 12379, peer_port = 12380 %} containers: +{%- with etcd_name = 'auxiliary-0', client_port = 12379, peer_port = 12380 %} {% include "genesis-etcd/server-container.yaml" with context %} +{%- endwith %} + +{%- with etcd_name = 'auxiliary-1', client_port = 22379, peer_port = 22380 %} +{% include "genesis-etcd/server-container.yaml" with context %} +{%- endwith %} + - name: monitor image: {{ config['Genesis:images.kubernetes.etcd'] }} command: @@ -19,21 +25,41 @@ spec: - -c - |- set -x - MEMBER_COUNT=$(etcdctl member list | grep '\bstarted\b' | wc -l) - if [ $MEMBER_COUNT -gt 1 ]; then - MEMBER_ID=$(etcdctl member list | grep auxiliary | awk -F ', ' '{ print $1 }') - if [ -n $MEMBER_ID ]; then - while [ $MEMBER_COUNT -lt 3 ]; do - sleep 30 - MEMBER_COUNT=$(etcdctl member list | grep '\bstarted\b' | wc -l) - done - set -e + + function external_member_count() { + etcdctl member list \ + | grep '\bstarted\b' \ + | grep -Ev "\\b({{ config['Genesis:hostname'] }}|auxiliary-0|auxiliary-1)\\b" \ + | wc -l + } + + function remove_if_possible() { + MEMBER_NAME=$1 + MEMBER_ID=$(etcdctl member list | grep "${MEMBER_NAME}" | awk -F ', ' '{ print $1 }') + if [ -n "${MEMBER_ID}" ]; then etcdctl member remove $MEMBER_ID fi + } - rm -rf /var/lib/etcd/* /manifests/auxiliary-kubernetes-etcd.yaml - sleep 10000 - fi + # NOTE(mark-burnett): If there are any non-genesis members, then we are ready to + # remove the auxiliary members. Otherwise, wait. + while [ ! "$(external_member_count)" -gt 0 ]; do + sleep 10 + done + + # NOTE(mark-burnett): Failures beyond this point are unexpected, but + # should be recovered by restarting this container. + set -e + + remove_if_possible auxiliary-0 + remove_if_possible auxiliary-1 + + rm -rf \ + /var/lib/etcd/auxiliary-0 \ + /var/lib/etcd/auxiliary-1 \ + /manifests/auxiliary-kubernetes-etcd.yaml + + sleep 10000 env: - name: ETCDCTL_API value: '3' @@ -48,16 +74,23 @@ spec: - name: ETCDCTL_KEY value: /etc/etcd/pki/etcd-client-key.pem volumeMounts: - - name: data + - name: all-etcd-data mountPath: /var/lib/etcd - - name: pki + - name: pki-auxiliary-0 mountPath: /etc/etcd/pki - name: manifest mountPath: /manifests volumes: +{%- with etcd_name = 'auxiliary-0', client_port = 12379, peer_port = 12380, volume_name = 'auxiliary-0' %} {% include "genesis-etcd/common-volumes.yaml" with context %} +{%- endwith %} +{%- with etcd_name = 'auxiliary-1', client_port = 22379, peer_port = 22380, volume_name = 'auxiliary-1' %} +{% include "genesis-etcd/common-volumes.yaml" with context %} +{%- endwith %} - name: manifest hostPath: path: /etc/kubernetes/manifests -{%- endwith %} + - name: all-etcd-data + hostPath: + path: /var/lib/etcd ... diff --git a/promenade/templates/roles/genesis/etc/kubernetes/manifests/bootstrap-armada.yaml b/promenade/templates/roles/genesis/etc/kubernetes/manifests/bootstrap-armada.yaml index d2f29fcc..4abe43d8 100644 --- a/promenade/templates/roles/genesis/etc/kubernetes/manifests/bootstrap-armada.yaml +++ b/promenade/templates/roles/genesis/etc/kubernetes/manifests/bootstrap-armada.yaml @@ -59,7 +59,7 @@ spec: while true; do sleep 10 - if armada --debug \ + if armada \ apply \ --target-manifest {{ config.get_path('Genesis:armada.target_manifest', 'cluster-bootstrap') }} \ --tiller-host 127.0.0.1 \ @@ -134,7 +134,7 @@ spec: - --bind-address=0.0.0.0 - --runtime-config=batch/v2alpha1=true - --allow-privileged=true - - --etcd-servers=https://localhost:2379 + - --etcd-servers=https://localhost:12379 - --etcd-cafile=/etc/kubernetes/apiserver/pki/etcd-client-ca.pem - --etcd-certfile=/etc/kubernetes/apiserver/pki/etcd-client.pem - --etcd-keyfile=/etc/kubernetes/apiserver/pki/etcd-client-key.pem diff --git a/promenade/templates/roles/genesis/etc/kubernetes/manifests/kubernetes-etcd.yaml b/promenade/templates/roles/genesis/etc/kubernetes/manifests/kubernetes-etcd.yaml index 8592b31b..4375af22 100644 --- a/promenade/templates/roles/genesis/etc/kubernetes/manifests/kubernetes-etcd.yaml +++ b/promenade/templates/roles/genesis/etc/kubernetes/manifests/kubernetes-etcd.yaml @@ -9,7 +9,7 @@ metadata: kubernetes-etcd-service: enabled spec: hostNetwork: true -{%- with etcd_name = 'genesis', client_port = 2379, peer_port = 2380 %} +{%- with etcd_name = config['Genesis:hostname'], client_port = 2379, peer_port = 2380, volume_name = 'kubernetes' %} containers: {% include "genesis-etcd/server-container.yaml" with context %} volumes: diff --git a/tools/g2/lib/virsh.sh b/tools/g2/lib/virsh.sh index df436e9b..c05af100 100644 --- a/tools/g2/lib/virsh.sh +++ b/tools/g2/lib/virsh.sh @@ -134,8 +134,6 @@ vm_create() { --memory "$(config_vm_memory)" \ --import \ --disk "vol=${VIRSH_POOL}/promenade-${NAME}.img,${DISK_OPTS}" \ - --disk "pool=${VIRSH_POOL},size=20,${DISK_OPTS}" \ - --disk "pool=${VIRSH_POOL},size=20,${DISK_OPTS}" \ --disk "vol=${VIRSH_POOL}/cloud-init-${NAME}.iso,device=cdrom" &>> "${LOG_FILE}" ssh_wait "${NAME}" diff --git a/tools/gate/config-templates/bootstrap-armada-config.yaml b/tools/gate/config-templates/bootstrap-armada-config.yaml index fc0b73e7..6c01f5cf 100644 --- a/tools/gate/config-templates/bootstrap-armada-config.yaml +++ b/tools/gate/config-templates/bootstrap-armada-config.yaml @@ -324,6 +324,9 @@ data: release: calico-etcd namespace: kube-system timeout: 600 + wait: + timeout: 6000 + test: true upgrade: no_hooks: true values: @@ -953,6 +956,9 @@ data: release: kubernetes-etcd namespace: kube-system timeout: 600 + wait: + timeout: 6000 + test: true upgrade: no_hooks: true values: