Fixes etcd race condition bug
- During genesis there was a race condition between the genesis node leaving and other nodes joining.
- Updated the etcd anchor to update the config when a host is not healthy.

Fixes #54
Change-Id: I0ba2c831c73cc3136ee635e7d0c0efcc8b009858
This commit is contained in:
parent
a3b79eabc0
commit
cf0037597d
|
@ -1,5 +1,5 @@
|
|||
#!/bin/sh
|
||||
# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
|
||||
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -12,37 +12,40 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -x
|
||||
|
||||
function copy_certificates {
|
||||
ETCD_NAME=$1
|
||||
|
||||
set -e
|
||||
|
||||
TEMP_MANIFEST=/tmp/etcd.yaml
|
||||
function sync_file {
|
||||
if ! cmp "$1" "$2"; then
|
||||
cp -f "$1" "$2"
|
||||
fi
|
||||
}
|
||||
function sync_certificates {
|
||||
mkdir -p /etcd-etc/tls
|
||||
# Copy CA Certificates in place
|
||||
cp \
|
||||
/etc/etcd/tls/certs/client-ca.pem \
|
||||
/etc/etcd/tls/certs/peer-ca.pem \
|
||||
/etcd-etc/tls
|
||||
|
||||
cp /etc/etcd/tls/certs/$ETCD_NAME-etcd-client.pem /etcd-etc/tls/etcd-client.pem
|
||||
cp /etc/etcd/tls/certs/$ETCD_NAME-etcd-peer.pem /etcd-etc/tls/etcd-peer.pem
|
||||
|
||||
cp /etc/etcd/tls/keys/$ETCD_NAME-etcd-client-key.pem /etcd-etc/tls/etcd-client-key.pem
|
||||
cp /etc/etcd/tls/keys/$ETCD_NAME-etcd-peer-key.pem /etcd-etc/tls/etcd-peer-key.pem
|
||||
|
||||
set +e
|
||||
sync_file /etc/etcd/tls/certs/client-ca.pem /etcd-etc/tls/client-ca.pem
|
||||
sync_file /etc/etcd/tls/certs/peer-ca.pem /etcd-etc/tls/peer-ca.pem
|
||||
sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-client.pem" /etcd-etc/tls/etcd-client.pem
|
||||
sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-peer.pem" /etcd-etc/tls/etcd-peer.pem
|
||||
sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-client-key.pem" /etcd-etc/tls/etcd-client-key.pem
|
||||
sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-peer-key.pem" /etcd-etc/tls/etcd-peer-key.pem
|
||||
}
|
||||
|
||||
function create_manifest {
|
||||
sed -i -e 's#_ETCD_INITIAL_CLUSTER_STATE_#'$2'#g' /anchor-etcd/{{ .Values.service.name }}.yaml
|
||||
sed -i -e 's#_ETCD_INITIAL_CLUSTER_#'$1'#g' /anchor-etcd/{{ .Values.service.name }}.yaml
|
||||
|
||||
cp /anchor-etcd/{{ .Values.service.name }}.yaml $MANIFEST_PATH
|
||||
WIP=/tmp/wip-manifest.yaml
|
||||
cp -f /anchor-etcd/{{ .Values.service.name }}.yaml $WIP
|
||||
sed -i -e 's#_ETCD_INITIAL_CLUSTER_STATE_#'$2'#g' $WIP
|
||||
sed -i -e 's#_ETCD_INITIAL_CLUSTER_#'$1'#g' $WIP
|
||||
mv -f "$WIP" "$3"
|
||||
}
|
||||
|
||||
function sync_configuration {
|
||||
sync_certificates
|
||||
ETCD_INITIAL_CLUSTER=$(grep -v $PEER_ENDPOINT "$1" \
|
||||
| awk -F ', ' '{ print $3 "=" $4 }' \
|
||||
| tr '\n' ',' \
|
||||
| sed "s;\$;$ETCD_NAME=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }};")
|
||||
ETCD_INITIAL_CLUSTER_STATE=existing
|
||||
create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$TEMP_MANIFEST"
|
||||
sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
|
||||
}
|
||||
firstrun=true
|
||||
while true; do
|
||||
# TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
|
||||
# up so I don't try to take two actions on the node at once.
|
||||
|
@ -67,62 +70,56 @@ while true; do
|
|||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if [ -e /bootstrapping/{{ .Values.bootstrapping.filename }} ]; then
|
||||
# Bootstrap the first node
|
||||
copy_certificates ${ETCD_NAME}
|
||||
sync_certificates
|
||||
ETCD_INITIAL_CLUSTER=${ETCD_NAME}=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }}
|
||||
ETCD_INITIAL_CLUSTER_STATE=new
|
||||
create_manifest $ETCD_INITIAL_CLUSTER $ETCD_INITIAL_CLUSTER_STATE
|
||||
|
||||
create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$MANIFEST_PATH"
|
||||
continue
|
||||
fi
|
||||
{{- end }}
|
||||
|
||||
sleep {{ .Values.anchor.period }}
|
||||
|
||||
if [ -e /tmp/stopped ]; then
|
||||
echo Stopping
|
||||
break
|
||||
fi
|
||||
|
||||
if [ -e /tmp/stopping ]; then
|
||||
echo Waiting to stop..
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ ! -e $MANIFEST_PATH ]; then
|
||||
if ! etcdctl member list > /tmp/members; then
|
||||
echo Failed to locate existing cluster
|
||||
continue
|
||||
fi
|
||||
|
||||
etcdctl member list > /tmp/members
|
||||
# if never started or (ever started and not currently started); then
|
||||
# resync
|
||||
# fi
|
||||
if ! grep $PEER_ENDPOINT /tmp/members; then
|
||||
# If this member is not in the cluster, try to add it.
|
||||
if grep -v '\bstarted\b' /tmp/members; then
|
||||
echo Cluster does not appear fully online, waiting.
|
||||
continue
|
||||
fi
|
||||
|
||||
# Add this member to the cluster
|
||||
etcdctl member add $HOSTNAME --peer-urls $PEER_ENDPOINT
|
||||
if ! etcdctl member add $HOSTNAME --peer-urls $PEER_ENDPOINT; then
|
||||
echo Failed to add $HOSTNAME to member list. Waiting.
|
||||
continue
|
||||
fi
|
||||
|
||||
# If needed, drop the file in place
|
||||
if [ ! -e FILE ]; then
|
||||
# Refresh member list
|
||||
echo Successfully added $HOSTNAME to cluster members.
|
||||
# Refresh member list so we start with the right configuration.
|
||||
etcdctl member list > /tmp/members
|
||||
|
||||
if grep $PEER_ENDPOINT /tmp/members; then
|
||||
copy_certificates ${ETCD_NAME}
|
||||
|
||||
ETCD_INITIAL_CLUSTER=$(grep -v $PEER_ENDPOINT /tmp/members \
|
||||
| awk -F ', ' '{ print $3 "=" $4 }' \
|
||||
| tr '\n' ',' \
|
||||
| sed "s;\$;$ETCD_NAME=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }};")
|
||||
ETCD_INITIAL_CLUSTER_STATE=existing
|
||||
|
||||
create_manifest $ETCD_INITIAL_CLUSTER $ETCD_INITIAL_CLUSTER_STATE
|
||||
fi
|
||||
if $firstrun; then
|
||||
sync_configuration /tmp/members
|
||||
firstrun=false
|
||||
fi
|
||||
if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
|
||||
# If not healthy, sleep before checking again and then updating configs.
|
||||
echo Member is not healthy, sleeping before checking again.
|
||||
sleep {{ .Values.anchor.health_wait_period }}
|
||||
if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
|
||||
# If still not healthy, update the configs.
|
||||
echo Member is not healthy, syncing configurations.
|
||||
sync_configuration /tmp/members
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
|
|
@ -32,6 +32,7 @@ anchor:
|
|||
manifest_path: /etc/kubernetes/manifests
|
||||
|
||||
period: 15
|
||||
health_wait_period: 60
|
||||
|
||||
etcd:
|
||||
host_etc_path: /etc/etcd-example
|
||||
|
|
|
@ -47,7 +47,7 @@
|
|||
"-l", "kubernetes-etcd=enabled",
|
||||
"-l", "kubernetes-scheduler=enabled",
|
||||
"-l", "ucp-control-plane=enabled",
|
||||
"-e", "kubernetes n0 genesis n1 n2 n3",
|
||||
"-e", "kubernetes n0 n0 n1 n2 n3",
|
||||
"-e", "calico n0 n0 n1 n2 n3"
|
||||
]
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue