From d2f020fbb7dde804ecbe6261bfe0098a18f7b13b Mon Sep 17 00:00:00 2001
From: "Hussey, Scott (sh8121)" <sh8121@att.com>
Date: Tue, 25 Jun 2019 09:47:00 -0500
Subject: [PATCH] Allow etcd anchor to recover from bad state

- If an etcd member has corrupted data or has somehow
  been removed from a cluster, the anchor does not currently
  recover. This change adds a configurable threshold of monitoring
  loops (anchor.saddness_threshold, default 3) after which the
  anchor removes the member from the cluster and recreates it.

Note: This is safe due to etcd's strict quorum checking on
      runtime reconfiguration, see [0].

[0] https://github.com/etcd-io/etcd/blob/master/Documentation/op-guide/configuration.md#--strict-reconfig-check

Change-Id: Id2ceea7393c46bed9fa5e3ead37014e52c91eac3
---
 charts/etcd/templates/bin/_etcdctl_anchor.tpl | 32 +++++++++++++++++--
 charts/etcd/values.yaml                       |  4 ++-
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/charts/etcd/templates/bin/_etcdctl_anchor.tpl b/charts/etcd/templates/bin/_etcdctl_anchor.tpl
index 6f458d75..7a19f3e3 100644
--- a/charts/etcd/templates/bin/_etcdctl_anchor.tpl
+++ b/charts/etcd/templates/bin/_etcdctl_anchor.tpl
@@ -46,7 +46,16 @@ function sync_configuration {
     sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
     chmod go-rwx "${MANIFEST_PATH}"
 }
+
+function cleanup_host {  # Wipe this host's etcd manifest, TLS files and data, then flag a fresh first run.
+    rm -f "${MANIFEST_PATH}"  # quoted to match sync_configuration's use of the same path
+    rm -rf /etcd-etc/tls/
+    rm -rf /etcd-data/*  # removes the contents only; the /etcd-data directory itself is kept
+    firstrun=true  # global: makes the main loop call sync_configuration again
+}
+
 firstrun=true
+saddness_duration=0
 while true; do
     # TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
     # up so I don't try to take two actions on the node at once.
@@ -91,9 +100,6 @@ while true; do
         continue
     fi
     etcdctl member list > /tmp/members
-    # if never started or (ever started and not currently started); then
-    #   resync
-    # fi
     if ! grep $PEER_ENDPOINT /tmp/members; then
         # If this member is not in the cluster, try to add it.
         if grep -v '\bstarted\b' /tmp/members; then
@@ -108,6 +114,22 @@ while true; do
         echo Successfully added $HOSTNAME to cluster members.
         # Refresh member list so we start with the right configuration.
         etcdctl member list > /tmp/members
+    elif grep "$PEER_ENDPOINT" /tmp/members | grep '\bunstarted\b'; then
+        # This member is listed in the cluster but has not started.
+        if [ "$saddness_duration" -ge {{ .Values.anchor.saddness_threshold }} ]; then
+          # Threshold reached: remove the member so a later loop can re-add it.
+          memberid=$(grep "$PEER_ENDPOINT" /tmp/members | awk -F ',' '{print $1}')
+          echo "Removing $memberid from etcd cluster to recreate."
+          if etcdctl member remove "$memberid"; then
+            cleanup_host
+            saddness_duration=0  # defensive: give the recreated member a fresh count
+          else
+            echo "ERROR: Attempted recreate member and failed!!!"
+          fi
+          continue
+        else
+          saddness_duration=$((saddness_duration + 1))
+        fi
     fi
     if $firstrun; then
         sync_configuration /tmp/members
@@ -122,6 +144,10 @@ while true; do
             echo Member is not healthy, syncing configurations.
             sync_configuration /tmp/members
             continue
+        else
+          saddness_duration=0
         fi
+    else
+      saddness_duration=0
     fi
 done
diff --git a/charts/etcd/values.yaml b/charts/etcd/values.yaml
index 7a00b634..946bc4b6 100644
--- a/charts/etcd/values.yaml
+++ b/charts/etcd/values.yaml
@@ -28,7 +28,9 @@ anchor:
   enable_cleanup: true
   etcdctl_endpoint: example-etcd
   host_data_path: /var/lib/etcd/example
-
+  # How many monitoring loops the anchor goes through with an unhealthy member
+  # before removing the member from the cluster and recreating it
+  saddness_threshold: 3
   kubelet:
     manifest_path: /etc/kubernetes/manifests