From d2f020fbb7dde804ecbe6261bfe0098a18f7b13b Mon Sep 17 00:00:00 2001
From: "Hussey, Scott (sh8121)"
Date: Tue, 25 Jun 2019 09:47:00 -0500
Subject: [PATCH] Allow etcd anchor to recover from bad state

- If an etcd member has corrupted data or has somehow been removed from
  a cluster, the anchor does not currently recover. This change adds
  a threshold of X monitoring loops after which the anchor will remove
  the member from the cluster and recreate it.

Note: This is safe due to etcd's strict quorum checking on runtime
reconfiguration, see [0].

[0] https://github.com/etcd-io/etcd/blob/master/Documentation/op-guide/configuration.md#--strict-reconfig-check

Change-Id: Id2ceea7393c46bed9fa5e3ead37014e52c91eac3
---
Reviewer note: relative to the exported commit, cleanup_host now also resets
saddness_duration and the added lines quote their variable expansions. Hunk
offsets and the diffstat were adjusted by hand; the blob ids on the index
lines were not regenerated.

 charts/etcd/templates/bin/_etcdctl_anchor.tpl | 37 +++++++++++++++++--
 charts/etcd/values.yaml                       |  4 ++-
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/charts/etcd/templates/bin/_etcdctl_anchor.tpl b/charts/etcd/templates/bin/_etcdctl_anchor.tpl
index 6f458d75..7a19f3e3 100644
--- a/charts/etcd/templates/bin/_etcdctl_anchor.tpl
+++ b/charts/etcd/templates/bin/_etcdctl_anchor.tpl
@@ -46,7 +46,21 @@ function sync_configuration {
     sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
     chmod go-rwx "${MANIFEST_PATH}"
 }
+
+# Remove this host's etcd manifest, TLS directory and data directory contents,
+# and reset the anchor loop state so the member is set up again from scratch.
+function cleanup_host {
+    rm -f "${MANIFEST_PATH}"
+    rm -rf /etcd-etc/tls/
+    rm -rf /etcd-data/*
+    firstrun=true
+    # Start the recreated member with a fresh grace period instead of carrying
+    # over a count that has already reached the removal threshold.
+    saddness_duration=0
+}
+
 firstrun=true
+saddness_duration=0
 while true; do
     # TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
     # up so I don't try to take two actions on the node at once.
@@ -91,9 +105,6 @@ while true; do
         continue
     fi
     etcdctl member list > /tmp/members
-    # if never started or (ever started and not currently started); then
-    #     resync
-    # fi
     if ! grep $PEER_ENDPOINT /tmp/members; then
         # If this member is not in the cluster, try to add it.
         if grep -v '\bstarted\b' /tmp/members; then
@@ -108,6 +119,22 @@ while true; do
         echo Successfully added $HOSTNAME to cluster members.
         # Refresh member list so we start with the right configuration.
         etcdctl member list > /tmp/members
+    elif grep "$PEER_ENDPOINT" /tmp/members | grep '\bunstarted\b'; then
+        # This member is in the cluster but not started
+        if [ "$saddness_duration" -ge {{ .Values.anchor.saddness_threshold }} ]
+        then
+            # We have surpassed the sadness duration, remove the member and try re-adding
+            memberid=$(grep "$PEER_ENDPOINT" /tmp/members | awk -F ',' '{print $1}')
+            echo "Removing $memberid from etcd cluster to recreate."
+            if etcdctl member remove "$memberid"; then
+                cleanup_host
+            else
+                echo "ERROR: Attempted recreate member and failed!!!"
+            fi
+            continue
+        else
+            saddness_duration=$((saddness_duration + 1))
+        fi
     fi
     if $firstrun; then
         sync_configuration /tmp/members
@@ -122,6 +149,10 @@ while true; do
             echo Member is not healthy, syncing configurations.
             sync_configuration /tmp/members
             continue
+        else
+            saddness_duration=0
         fi
+    else
+        saddness_duration=0
     fi
 done
diff --git a/charts/etcd/values.yaml b/charts/etcd/values.yaml
index 7a00b634..946bc4b6 100644
--- a/charts/etcd/values.yaml
+++ b/charts/etcd/values.yaml
@@ -28,7 +28,9 @@ anchor:
   enable_cleanup: true
   etcdctl_endpoint: example-etcd
   host_data_path: /var/lib/etcd/example
-
+  # How many monitoring loops the anchor goes through with an unhealthy member
+  # before removing the member from the cluster and recreating
+  saddness_threshold: 3
 
   kubelet:
     manifest_path: /etc/kubernetes/manifests