Add a second auxiliary etcd server for bootstrap

This adds stability to etcd during bootstrap and enables cleaner waiting by
Tiller during deployment of the Kubernetes apiserver and etcd.

* Adds a second auxiliary etcd process.
* Enables "sequenced" for the remaining ChartGroups.
* Removes unused disks from test VMs.
* Adds readiness and liveness probes for Kubernetes components (sketched below).

Change-Id: I6f83bb912f76b0ec35503723b417ba45d69e39c5
Mark Burnett 2018-04-16 11:09:55 -05:00
parent 75d95c775f
commit 8bbb84af97
18 changed files with 183 additions and 64 deletions
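
For orientation before the per-file diffs: the readiness and liveness probes added to the apiserver, controller-manager, and scheduler templates all follow the same shape, shown here as a minimal sketch. The timing values and the HTTPS scheme are taken from the apiserver hunks below; the other components use their own ports from values.yaml and plain HTTP.

  # Sketch only -- concrete ports, schemes, and timings vary per component.
  readinessProbe:
    httpGet:
      host: 127.0.0.1
      path: /healthz
      port: {{ .Values.network.kubernetes_apiserver.port }}
      scheme: HTTPS
    initialDelaySeconds: 10
    periodSeconds: 5
    timeoutSeconds: 5
  livenessProbe:
    failureThreshold: 2
    httpGet:
      host: 127.0.0.1
      path: /healthz
      port: {{ .Values.network.kubernetes_apiserver.port }}
      scheme: HTTPS
    initialDelaySeconds: 15
    periodSeconds: 10
    successThreshold: 1
    timeoutSeconds: 10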

View File

@@ -62,6 +62,17 @@ spec:
exec:
command:
- /tmp/bin/pre_stop
readinessProbe:
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_apiserver.port }}
scheme: HTTPS
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
volumeMounts:
- name: {{ .Values.service.name }}-certs
mountPath: /certs

View File

@@ -33,6 +33,7 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.podIP
command:
{{- range .Values.command_prefix }}
- {{ . }}
@@ -58,6 +59,29 @@ spec:
ports:
- containerPort: {{ .Values.network.kubernetes_apiserver.port }}
readinessProbe:
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_apiserver.port }}
scheme: HTTPS
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
livenessProbe:
failureThreshold: 2
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_apiserver.port }}
scheme: HTTPS
initialDelaySeconds: 15
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 10
volumeMounts:
- name: etc
mountPath: /etc/kubernetes/apiserver

View File

@@ -61,6 +61,16 @@ spec:
exec:
command:
- /tmp/bin/pre_stop
readinessProbe:
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_controller_manager.port }}
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
volumeMounts:
- name: {{ .Values.service.name }}-bin
mountPath: /tmp/bin

View File

@@ -39,6 +39,8 @@ spec:
{{- range .Values.command_prefix }}
- {{ . }}
{{- end }}
- --address=127.0.0.1
- --port={{ .Values.network.kubernetes_controller_manager.port }}
- --configure-cloud-routes=false
- --leader-elect=true
- --kubeconfig=/etc/kubernetes/controller-manager/kubeconfig.yaml
@@ -47,6 +49,26 @@ spec:
- --use-service-account-credentials=true
- --v=5
readinessProbe:
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_controller_manager.port }}
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
livenessProbe:
failureThreshold: 2
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_controller_manager.port }}
initialDelaySeconds: 15
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 10
volumeMounts:
- name: etc
mountPath: /etc/kubernetes/controller-manager

View File

@@ -68,6 +68,8 @@ network:
kubernetes_netloc: 10.96.0.1
pod_cidr: 10.97.0.0/16
service_cidr: 10.96.0.0/16
kubernetes_controller_manager:
port: 10252
service:
name: kubernetes-controller-manager

View File

@@ -24,18 +24,12 @@ metadata:
annotations:
"helm.sh/hook": "test-success"
spec:
nodeSelector:
{{ .Values.labels.anchor.node_selector_key }}: {{ .Values.labels.anchor.node_selector_value }}
restartPolicy: Never
containers:
- name: "{{ .Release.Name }}-etcd-test"
env:
- name: ETCD_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: ETCDCTL_API
value: '3'
- name: ETCDCTL_DIAL_TIMEOUT
@@ -48,37 +42,19 @@ spec:
value: /etc/etcd/tls/certs/anchor-etcd-client.pem
- name: ETCDCTL_KEY
value: /etc/etcd/tls/keys/anchor-etcd-client-key.pem
- name: CLIENT_ENDPOINT
value: https://$(POD_IP):{{ .Values.network.service_client.target_port }}
- name: PEER_ENDPOINT
value: https://$(POD_IP):{{ .Values.network.service_peer.target_port }}
image: {{ .Values.images.tags.etcdctl }}
imagePullPolicy: {{ .Values.images.pull_policy }}
{{ tuple . .Values.pod.resources.test | include "helm-toolkit.snippets.kubernetes_resources" | indent 6 }}
command:
- /bin/sh
- -c
- |
HEALTH=$(etcdctl endpoint health)
CODE=$?
echo $HEALTH
exit $CODE
- etcdctl
- endpoint
- health
volumeMounts:
- name: {{ .Values.service.name }}-bin
mountPath: /tmp
- name: {{ .Values.service.name }}-certs
mountPath: /etc/etcd/tls/certs
- name: etcd-etc
mountPath: /etcd-etc
- name: {{ .Values.service.name }}-keys
mountPath: /etc/etcd/tls/keys
- name: {{ .Values.service.name }}-etc
mountPath: /anchor-etcd
volumes:
- name: {{ .Values.service.name }}-bin
configMap:
name: {{ .Values.service.name }}-bin
defaultMode: 0555
- name: {{ .Values.service.name }}-certs
configMap:
name: {{ .Values.service.name }}-certs
@@ -87,12 +63,5 @@ spec:
secret:
secretName: {{ .Values.service.name }}-keys
defaultMode: 0444
- name: etcd-etc
hostPath:
path: {{ .Values.etcd.host_etc_path }}
- name: {{ .Values.service.name }}-etc
configMap:
name: {{ .Values.service.name }}-etc
defaultMode: 0444
...
{{- end }}

View File

@@ -39,9 +39,31 @@ spec:
{{- range .Values.command_prefix }}
- {{ . }}
{{- end }}
- --address=127.0.0.1
- --port={{ .Values.network.kubernetes_scheduler.port }}
- --leader-elect=true
- --kubeconfig=/etc/kubernetes/scheduler/kubeconfig.yaml
readinessProbe:
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_scheduler.port }}
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
livenessProbe:
failureThreshold: 2
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_scheduler.port }}
initialDelaySeconds: 15
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 15
volumeMounts:
- name: etc
mountPath: /etc/kubernetes/scheduler

View File

@@ -53,6 +53,16 @@ spec:
exec:
command:
- /tmp/bin/pre_stop
readinessProbe:
httpGet:
host: 127.0.0.1
path: /healthz
port: {{ .Values.network.kubernetes_scheduler.port }}
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
volumeMounts:
- name: bin
mountPath: /tmp/bin

View File

@@ -69,6 +69,8 @@ images:
network:
kubernetes_netloc: 10.96.0.1
kubernetes_scheduler:
port: 10251
service:
name: kubernetes-scheduler

View File

@@ -68,6 +68,7 @@ metadata:
storagePolicy: cleartext
data:
description: Kubernetes components
sequenced: true
chart_group:
- haproxy
- kubernetes-etcd
@@ -86,6 +87,7 @@ metadata:
storagePolicy: cleartext
data:
description: UCP platform components
sequenced: true
chart_group:
- promenade
---
@@ -585,6 +587,10 @@ data:
kubernetes-apiserver:
server_opts: "check port 6443"
conf_parts:
global:
- timeout connect 5000ms
- timeout client 30s
- timeout server 30s
frontend:
- mode tcp
- bind *:6553
@@ -1107,6 +1113,9 @@ data:
no_hooks: false
upgrade:
no_hooks: false
timeout: 600
wait:
timeout: 600
values:
images:
tags:
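
The "sequenced: true" flag added to the ChartGroups above tells Armada to deploy the charts in each group one at a time, waiting on each release before starting the next, rather than launching them in parallel. Put together, a sequenced ChartGroup document looks roughly like this (a sketch; the document name is illustrative and the chart list is abbreviated):

  schema: armada/ChartGroup/v1
  metadata:
    schema: metadata/Document/v1
    name: kubernetes-components   # illustrative name
    storagePolicy: cleartext
  data:
    description: Kubernetes components
    sequenced: true
    chart_group:
      - haproxy
      - kubernetes-etcd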

View File

@@ -373,6 +373,7 @@ data:
timeout: 600
wait:
timeout: 600
test: true
upgrade:
no_hooks: true
values:

View File

@@ -1,6 +1,6 @@
- name: data
- name: data-{{ etcd_name }}
hostPath:
path: /var/lib/etcd/{{ etcd_name }}
- name: pki
path: /var/lib/etcd/{{ volume_name }}
- name: pki-{{ etcd_name }}
hostPath:
path: /etc/genesis/etcd/pki

View File

@@ -1,4 +1,4 @@
- name: etcd
- name: etcd-{{ etcd_name }}
image: {{ config['Genesis:images.kubernetes.etcd'] }}
env:
- name: ETCD_NAME
@@ -38,7 +38,7 @@
- name: ETCD_INITIAL_CLUSTER_STATE
value: new
- name: ETCD_INITIAL_CLUSTER
value: genesis=https://{{ config['Genesis:ip'] }}:2380,auxiliary=https://{{ config['Genesis:ip'] }}:12380
value: {{ config['Genesis:hostname'] }}=https://{{ config['Genesis:ip'] }}:2380,auxiliary-0=https://{{ config['Genesis:ip'] }}:12380,auxiliary-1=https://{{ config['Genesis:ip'] }}:22380
- name: ETCDCTL_API
value: '3'
- name: ETCDCTL_DIAL_TIMEOUT
@@ -57,7 +57,7 @@
- name: peer
containerPort: {{ peer_port }}
volumeMounts:
- name: data
- name: data-{{ etcd_name }}
mountPath: /var/lib/etcd
- name: pki
- name: pki-{{ etcd_name }}
mountPath: /etc/etcd/pki

View File

@@ -9,9 +9,15 @@ metadata:
promenade: genesis
spec:
hostNetwork: true
{%- with etcd_name = 'auxiliary', client_port = 12379, peer_port = 12380 %}
containers:
{%- with etcd_name = 'auxiliary-0', client_port = 12379, peer_port = 12380 %}
{% include "genesis-etcd/server-container.yaml" with context %}
{%- endwith %}
{%- with etcd_name = 'auxiliary-1', client_port = 22379, peer_port = 22380 %}
{% include "genesis-etcd/server-container.yaml" with context %}
{%- endwith %}
- name: monitor
image: {{ config['Genesis:images.kubernetes.etcd'] }}
command:
@@ -19,21 +25,41 @@ spec:
- -c
- |-
set -x
MEMBER_COUNT=$(etcdctl member list | grep '\bstarted\b' | wc -l)
if [ $MEMBER_COUNT -gt 1 ]; then
MEMBER_ID=$(etcdctl member list | grep auxiliary | awk -F ', ' '{ print $1 }')
if [ -n $MEMBER_ID ]; then
while [ $MEMBER_COUNT -lt 3 ]; do
sleep 30
MEMBER_COUNT=$(etcdctl member list | grep '\bstarted\b' | wc -l)
done
set -e
function external_member_count() {
etcdctl member list \
| grep '\bstarted\b' \
| grep -Ev "\\b({{ config['Genesis:hostname'] }}|auxiliary-0|auxiliary-1)\\b" \
| wc -l
}
function remove_if_possible() {
MEMBER_NAME=$1
MEMBER_ID=$(etcdctl member list | grep "${MEMBER_NAME}" | awk -F ', ' '{ print $1 }')
if [ -n "${MEMBER_ID}" ]; then
etcdctl member remove $MEMBER_ID
fi
}
rm -rf /var/lib/etcd/* /manifests/auxiliary-kubernetes-etcd.yaml
sleep 10000
fi
# NOTE(mark-burnett): If there are any non-genesis members, then we are ready to
# remove the auxiliary members. Otherwise, wait.
while [ ! "$(external_member_count)" -gt 0 ]; do
sleep 10
done
# NOTE(mark-burnett): Failures beyond this point are unexpected, but
# should be recovered by restarting this container.
set -e
remove_if_possible auxiliary-0
remove_if_possible auxiliary-1
rm -rf \
/var/lib/etcd/auxiliary-0 \
/var/lib/etcd/auxiliary-1 \
/manifests/auxiliary-kubernetes-etcd.yaml
sleep 10000
env:
- name: ETCDCTL_API
value: '3'
@@ -48,16 +74,23 @@ spec:
- name: ETCDCTL_KEY
value: /etc/etcd/pki/etcd-client-key.pem
volumeMounts:
- name: data
- name: all-etcd-data
mountPath: /var/lib/etcd
- name: pki
- name: pki-auxiliary-0
mountPath: /etc/etcd/pki
- name: manifest
mountPath: /manifests
volumes:
{%- with etcd_name = 'auxiliary-0', client_port = 12379, peer_port = 12380, volume_name = 'auxiliary-0' %}
{% include "genesis-etcd/common-volumes.yaml" with context %}
{%- endwith %}
{%- with etcd_name = 'auxiliary-1', client_port = 22379, peer_port = 22380, volume_name = 'auxiliary-1' %}
{% include "genesis-etcd/common-volumes.yaml" with context %}
{%- endwith %}
- name: manifest
hostPath:
path: /etc/kubernetes/manifests
{%- endwith %}
- name: all-etcd-data
hostPath:
path: /var/lib/etcd
...

View File

@@ -59,7 +59,7 @@ spec:
while true; do
sleep 10
if armada --debug \
if armada \
apply \
--target-manifest {{ config.get_path('Genesis:armada.target_manifest', 'cluster-bootstrap') }} \
--tiller-host 127.0.0.1 \
@@ -134,7 +134,7 @@ spec:
- --bind-address=0.0.0.0
- --runtime-config=batch/v2alpha1=true
- --allow-privileged=true
- --etcd-servers=https://localhost:2379
- --etcd-servers=https://localhost:12379
- --etcd-cafile=/etc/kubernetes/apiserver/pki/etcd-client-ca.pem
- --etcd-certfile=/etc/kubernetes/apiserver/pki/etcd-client.pem
- --etcd-keyfile=/etc/kubernetes/apiserver/pki/etcd-client-key.pem

View File

@@ -9,7 +9,7 @@ metadata:
kubernetes-etcd-service: enabled
spec:
hostNetwork: true
{%- with etcd_name = 'genesis', client_port = 2379, peer_port = 2380 %}
{%- with etcd_name = config['Genesis:hostname'], client_port = 2379, peer_port = 2380, volume_name = 'kubernetes' %}
containers:
{% include "genesis-etcd/server-container.yaml" with context %}
volumes:

View File

@@ -134,8 +134,6 @@ vm_create() {
--memory "$(config_vm_memory)" \
--import \
--disk "vol=${VIRSH_POOL}/promenade-${NAME}.img,${DISK_OPTS}" \
--disk "pool=${VIRSH_POOL},size=20,${DISK_OPTS}" \
--disk "pool=${VIRSH_POOL},size=20,${DISK_OPTS}" \
--disk "vol=${VIRSH_POOL}/cloud-init-${NAME}.iso,device=cdrom" &>> "${LOG_FILE}"
ssh_wait "${NAME}"

View File

@@ -324,6 +324,9 @@ data:
release: calico-etcd
namespace: kube-system
timeout: 600
wait:
timeout: 6000
test: true
upgrade:
no_hooks: true
values:
@@ -953,6 +956,9 @@ data:
release: kubernetes-etcd
namespace: kube-system
timeout: 600
wait:
timeout: 6000
test: true
upgrade:
no_hooks: true
values: