Fixes etcd race condition bug

- During genesis there was a race condition between the genesis node
  leaving and other nodes joining.
- Updated the etcd anchor to re-sync the configuration when a host is not healthy.

fixes #54

Change-Id: I0ba2c831c73cc3136ee635e7d0c0efcc8b009858
This commit is contained in:
Aaron Sheffield 2018-03-21 15:35:19 -05:00
parent a3b79eabc0
commit cf0037597d
3 changed files with 63 additions and 65 deletions

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
# Copyright 2017 AT&T Intellectual Property. All other rights reserved. # Copyright 2018 AT&T Intellectual Property. All other rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -12,37 +12,40 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
set -x set -x
TEMP_MANIFEST=/tmp/etcd.yaml
function copy_certificates { function sync_file {
ETCD_NAME=$1 if ! cmp "$1" "$2"; then
cp -f "$1" "$2"
set -e fi
}
function sync_certificates {
mkdir -p /etcd-etc/tls mkdir -p /etcd-etc/tls
# Copy CA Certificates in place sync_file /etc/etcd/tls/certs/client-ca.pem /etcd-etc/tls/client-ca.pem
cp \ sync_file /etc/etcd/tls/certs/peer-ca.pem /etcd-etc/tls/peer-ca.pem
/etc/etcd/tls/certs/client-ca.pem \ sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-client.pem" /etcd-etc/tls/etcd-client.pem
/etc/etcd/tls/certs/peer-ca.pem \ sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-peer.pem" /etcd-etc/tls/etcd-peer.pem
/etcd-etc/tls sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-client-key.pem" /etcd-etc/tls/etcd-client-key.pem
sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-peer-key.pem" /etcd-etc/tls/etcd-peer-key.pem
cp /etc/etcd/tls/certs/$ETCD_NAME-etcd-client.pem /etcd-etc/tls/etcd-client.pem
cp /etc/etcd/tls/certs/$ETCD_NAME-etcd-peer.pem /etcd-etc/tls/etcd-peer.pem
cp /etc/etcd/tls/keys/$ETCD_NAME-etcd-client-key.pem /etcd-etc/tls/etcd-client-key.pem
cp /etc/etcd/tls/keys/$ETCD_NAME-etcd-peer-key.pem /etcd-etc/tls/etcd-peer-key.pem
set +e
} }
function create_manifest { function create_manifest {
sed -i -e 's#_ETCD_INITIAL_CLUSTER_STATE_#'$2'#g' /anchor-etcd/{{ .Values.service.name }}.yaml WIP=/tmp/wip-manifest.yaml
sed -i -e 's#_ETCD_INITIAL_CLUSTER_#'$1'#g' /anchor-etcd/{{ .Values.service.name }}.yaml cp -f /anchor-etcd/{{ .Values.service.name }}.yaml $WIP
sed -i -e 's#_ETCD_INITIAL_CLUSTER_STATE_#'$2'#g' $WIP
cp /anchor-etcd/{{ .Values.service.name }}.yaml $MANIFEST_PATH sed -i -e 's#_ETCD_INITIAL_CLUSTER_#'$1'#g' $WIP
mv -f "$WIP" "$3"
} }
function sync_configuration {
sync_certificates
ETCD_INITIAL_CLUSTER=$(grep -v $PEER_ENDPOINT "$1" \
| awk -F ', ' '{ print $3 "=" $4 }' \
| tr '\n' ',' \
| sed "s;\$;$ETCD_NAME=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }};")
ETCD_INITIAL_CLUSTER_STATE=existing
create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$TEMP_MANIFEST"
sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
}
firstrun=true
while true; do while true; do
# TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting # TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
# up so I don't try to take two actions on the node at once. # up so I don't try to take two actions on the node at once.
@ -67,62 +70,56 @@ while true; do
fi fi
done done
fi fi
if [ -e /bootstrapping/{{ .Values.bootstrapping.filename }} ]; then if [ -e /bootstrapping/{{ .Values.bootstrapping.filename }} ]; then
# Bootstrap the first node # Bootstrap the first node
copy_certificates ${ETCD_NAME} sync_certificates
ETCD_INITIAL_CLUSTER=${ETCD_NAME}=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }} ETCD_INITIAL_CLUSTER=${ETCD_NAME}=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }}
ETCD_INITIAL_CLUSTER_STATE=new ETCD_INITIAL_CLUSTER_STATE=new
create_manifest $ETCD_INITIAL_CLUSTER $ETCD_INITIAL_CLUSTER_STATE create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$MANIFEST_PATH"
continue continue
fi fi
{{- end }} {{- end }}
sleep {{ .Values.anchor.period }} sleep {{ .Values.anchor.period }}
if [ -e /tmp/stopped ]; then if [ -e /tmp/stopped ]; then
echo Stopping echo Stopping
break break
fi fi
if [ -e /tmp/stopping ]; then if [ -e /tmp/stopping ]; then
echo Waiting to stop.. echo Waiting to stop..
continue continue
fi fi
etcdctl member list > /tmp/members
if [ ! -e $MANIFEST_PATH ]; then # if never started or (ever started and not currently started); then
if ! etcdctl member list > /tmp/members; then # resync
echo Failed to locate existing cluster # fi
if ! grep $PEER_ENDPOINT /tmp/members; then
# If this member is not in the cluster, try to add it.
if grep -v '\bstarted\b' /tmp/members; then
echo Cluster does not appear fully online, waiting.
continue continue
fi fi
# Add this member to the cluster
if ! grep $PEER_ENDPOINT /tmp/members; then if ! etcdctl member add $HOSTNAME --peer-urls $PEER_ENDPOINT; then
if grep -v '\bstarted\b' /tmp/members; then echo Failed to add $HOSTNAME to member list. Waiting.
echo Cluster does not appear fully online, waiting. continue
continue
fi
# Add this member to the cluster
etcdctl member add $HOSTNAME --peer-urls $PEER_ENDPOINT
fi fi
echo Successfully added $HOSTNAME to cluster members.
# If needed, drop the file in place # Refresh member list so we start with the right configuration.
if [ ! -e FILE ]; then etcdctl member list > /tmp/members
# Refresh member list fi
etcdctl member list > /tmp/members if $firstrun; then
sync_configuration /tmp/members
if grep $PEER_ENDPOINT /tmp/members; then firstrun=false
copy_certificates ${ETCD_NAME} fi
if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
ETCD_INITIAL_CLUSTER=$(grep -v $PEER_ENDPOINT /tmp/members \ # If not health, sleeps before checking again and then updating configs.
| awk -F ', ' '{ print $3 "=" $4 }' \ echo Member is not healthy, sleeping before checking again.
| tr '\n' ',' \ sleep {{ .Values.anchor.health_wait_period }}
| sed "s;\$;$ETCD_NAME=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }};") if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
ETCD_INITIAL_CLUSTER_STATE=existing # If still not healthy updates the configs.
echo Member is not healthy, syncing configurations.
create_manifest $ETCD_INITIAL_CLUSTER $ETCD_INITIAL_CLUSTER_STATE sync_configuration /tmp/members
fi continue
fi fi
fi fi
done done

View File

@ -32,6 +32,7 @@ anchor:
manifest_path: /etc/kubernetes/manifests manifest_path: /etc/kubernetes/manifests
period: 15 period: 15
health_wait_period: 60
etcd: etcd:
host_etc_path: /etc/etcd-example host_etc_path: /etc/etcd-example

View File

@ -47,7 +47,7 @@
"-l", "kubernetes-etcd=enabled", "-l", "kubernetes-etcd=enabled",
"-l", "kubernetes-scheduler=enabled", "-l", "kubernetes-scheduler=enabled",
"-l", "ucp-control-plane=enabled", "-l", "ucp-control-plane=enabled",
"-e", "kubernetes n0 genesis n1 n2 n3", "-e", "kubernetes n0 n0 n1 n2 n3",
"-e", "calico n0 n0 n1 n2 n3" "-e", "calico n0 n0 n1 n2 n3"
] ]
}, },