Allow etcd anchor to recover from bad state

- If an etcd member has corrupted data or has somehow
  been removed from a cluster, the anchor does not currently
  recover. This change adds a configurable threshold
  (anchor.saddness_threshold) of monitoring loops after which
  the anchor will remove the member from the cluster and
  recreate it.

Note: This is safe due to etcd's strict quorum checking on
      runtime reconfiguration, see [0].

[0] https://github.com/etcd-io/etcd/blob/master/Documentation/op-guide/configuration.md#--strict-reconfig-check

Change-Id: Id2ceea7393c46bed9fa5e3ead37014e52c91eac3
This commit is contained in:
Hussey, Scott (sh8121) 2019-06-25 09:47:00 -05:00
parent 59c27d76ab
commit d2f020fbb7
2 changed files with 32 additions and 4 deletions

View File

@ -46,7 +46,16 @@ function sync_configuration {
sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
chmod go-rwx "${MANIFEST_PATH}"
}
# Remove this member's on-host state: the manifest at MANIFEST_PATH, the TLS
# directory and the contents of the data directory.
# Globals:
#   MANIFEST_PATH (read)   - path of the manifest file to delete
#   firstrun      (written) - set back to true so the monitoring loop takes
#                             its `if $firstrun` branch again
function cleanup_host {
    # Quoted (and guarded with --) so a path containing whitespace, glob
    # characters or a leading dash is removed as exactly one file.
    rm -f -- "${MANIFEST_PATH}"
    rm -rf /etcd-etc/tls/
    # Only the directory contents are removed; /etcd-data itself is kept.
    # Note the glob does not match dotfiles.
    rm -rf /etcd-data/*
    firstrun=true
}
firstrun=true
saddness_duration=0
while true; do
# TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
# up so I don't try to take two actions on the node at once.
@ -91,9 +100,6 @@ while true; do
continue
fi
etcdctl member list > /tmp/members
# if never started or (ever started and not currently started); then
# resync
# fi
if ! grep $PEER_ENDPOINT /tmp/members; then
# If this member is not in the cluster, try to add it.
if grep -v '\bstarted\b' /tmp/members; then
@ -108,6 +114,22 @@ while true; do
echo Successfully added $HOSTNAME to cluster members.
# Refresh member list so we start with the right configuration.
etcdctl member list > /tmp/members
elif grep $PEER_ENDPOINT /tmp/members | grep '\bunstarted\b'; then
# This member is in the cluster but not started
if [ $saddness_duration -ge {{ .Values.anchor.saddness_threshold }} ]
then
# We have surpassed the sadness duration, remove the member and try re-adding
memberid=$(grep $PEER_ENDPOINT /tmp/members | awk -F ',' '{print $1}')
echo "Removing $memberid from etcd cluster to recreate."
if etcdctl member remove "$memberid"; then
cleanup_host
else
echo "ERROR: Attempted recreate member and failed!!!"
fi
continue
else
saddness_duration=$(($saddness_duration+1))
fi
fi
if $firstrun; then
sync_configuration /tmp/members
@ -122,6 +144,10 @@ while true; do
echo Member is not healthy, syncing configurations.
sync_configuration /tmp/members
continue
else
saddness_duration=0
fi
else
saddness_duration=0
fi
done

View File

@ -28,7 +28,9 @@ anchor:
enable_cleanup: true
etcdctl_endpoint: example-etcd
host_data_path: /var/lib/etcd/example
# How many monitoring loops the anchor goes through with an unstarted member
# before removing the member from the cluster and recreating it
saddness_threshold: 3
kubelet:
manifest_path: /etc/kubernetes/manifests