Allow etcd anchor to recover from bad state

- If an etcd member has corrupted data or has somehow been
  removed from the cluster, the anchor does not currently
  recover. This change adds a configurable threshold of
  monitoring loops (anchor.saddness_threshold, default 3) after
  which the anchor removes the member from the cluster and
  recreates it. A deploy-time override is sketched below.
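
  As a rough illustration of tuning that threshold, the override
  might be passed at deploy time roughly as follows; the release
  name, chart path, and use of plain Helm here are assumptions,
  and only the anchor.saddness_threshold key comes from this
  change:

      # Hypothetical values override; every name except the key
      # anchor.saddness_threshold is a placeholder.
      helm upgrade my-etcd ./charts/etcd \
          --set anchor.saddness_threshold=5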

Note: This is safe due to etcd's strict quorum checking on
      runtime reconfiguration; see [0].

[0] https://github.com/etcd-io/etcd/blob/master/Documentation/op-guide/configuration.md#--strict-reconfig-check
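
A minimal sketch of the guard [0] refers to: etcd's
--strict-reconfig-check option rejects member add/remove requests
that would leave the cluster without quorum. The invocation below
is illustrative only; it omits every other required option, and
the --name and --data-dir values are placeholders:

    # Quorum-guarded runtime reconfiguration; only the
    # --strict-reconfig-check flag is the point of this sketch.
    etcd --name "${HOSTNAME}" \
        --data-dir /etcd-data \
        --strict-reconfig-check=true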

Change-Id: Id2ceea7393c46bed9fa5e3ead37014e52c91eac3
Author: Hussey, Scott (sh8121)
Date: 2019-06-25 09:47:00 -05:00
parent 59c27d76ab
commit d2f020fbb7
2 changed files with 32 additions and 4 deletions

@@ -46,7 +46,16 @@ function sync_configuration {
     sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
     chmod go-rwx "${MANIFEST_PATH}"
 }
+function cleanup_host {
+    rm -f $MANIFEST_PATH
+    rm -rf /etcd-etc/tls/
+    rm -rf /etcd-data/*
+    firstrun=true
+}
 firstrun=true
+saddness_duration=0
 while true; do
     # TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
     # up so I don't try to take two actions on the node at once.
@@ -91,9 +100,6 @@ while true; do
         continue
     fi
     etcdctl member list > /tmp/members
-    # if never started or (ever started and not currently started); then
-    #   resync
-    # fi
     if ! grep $PEER_ENDPOINT /tmp/members; then
         # If this member is not in the cluster, try to add it.
         if grep -v '\bstarted\b' /tmp/members; then
@@ -108,6 +114,22 @@ while true; do
         echo Successfully added $HOSTNAME to cluster members.
         # Refresh member list so we start with the right configuration.
         etcdctl member list > /tmp/members
+    elif grep $PEER_ENDPOINT /tmp/members | grep '\bunstarted\b'; then
+        # This member is in the cluster but not started
+        if [ $saddness_duration -ge {{ .Values.anchor.saddness_threshold }} ]
+        then
+            # We have surpassed the sadness duration, remove the member and try re-adding
+            memberid=$(grep $PEER_ENDPOINT /tmp/members | awk -F ',' '{print $1}')
+            echo "Removing $memberid from etcd cluster to recreate."
+            if etcdctl member remove "$memberid"; then
+                cleanup_host
+            else
+                echo "ERROR: Attempted recreate member and failed!!!"
+            fi
+            continue
+        else
+            saddness_duration=$(($saddness_duration+1))
+        fi
     fi
     if $firstrun; then
         sync_configuration /tmp/members
@@ -122,6 +144,10 @@ while true; do
             echo Member is not healthy, syncing configurations.
             sync_configuration /tmp/members
             continue
+        else
+            saddness_duration=0
         fi
+    else
+        saddness_duration=0
     fi
 done

@@ -28,7 +28,9 @@ anchor:
   enable_cleanup: true
   etcdctl_endpoint: example-etcd
   host_data_path: /var/lib/etcd/example
+  # How many monitoring loops the anchor goes through with an unhealthy member
+  # before removing the member from the cluster and recreating
+  saddness_threshold: 3
 kubelet:
   manifest_path: /etc/kubernetes/manifests