From 4ed467e512ac1d95f577e44324ed04d68b07c183 Mon Sep 17 00:00:00 2001
From: Craig Anderson <craig.cigar.anderson@gmail.com>
Date: Tue, 11 Dec 2018 06:53:54 +0000
Subject: [PATCH] Add retry/rerun support for exec module

Add support for retries and reruns at specified intervals for
divingbell-exec scripts. Also adds support for timeouts.

Also update osh-infra-upgrade-host to allow gate to run.

Change-Id: I5f4cd43b13a467d94f67b358f3190f515256ae66
---
 divingbell/templates/bin/_exec.sh.tpl         | 101 ++++++++++++---
 doc/source/index.rst                          |  36 +++---
 .../playbooks/osh-infra-upgrade-host.yaml     |  12 ++
 tools/gate/scripts/020-test-divingbell.sh     | 117 ++++++++++++++++++
 4 files changed, 227 insertions(+), 39 deletions(-)

diff --git a/divingbell/templates/bin/_exec.sh.tpl b/divingbell/templates/bin/_exec.sh.tpl
index a042d55..41924f0 100644
--- a/divingbell/templates/bin/_exec.sh.tpl
+++ b/divingbell/templates/bin/_exec.sh.tpl
@@ -16,6 +16,8 @@
 # limitations under the License.
 */}}
 
+{{- $exec_loop_sleep_interval := 60 }}
+
 set -e
 
 cat <<'UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381' > {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
@@ -74,19 +76,28 @@ cd "${exec_path}"
       {{- $_ := set $.Values "__blocking_policy" $keypath.blocking_policy }}
     {{- end }}
 
-    {{- $_ := set $.Values "__timeout" 3600 }}
+    {{- $_ := set $.Values "__timeout" 1800 }}
     {{- if hasKey $keypath "timeout" }}
-      {{- fail (print "NOT IMPLEMENTED: 'timeout' FOR '" $script "'") }}
+      {{- if eq ($keypath.timeout | toString) "infinite" }}
+        {{- fail (print "BAD 'timeout' FOR '" $script "': 'infinite' timeouts not supported.") }}
+      {{- end }}
       {{- $_ := set $.Values "__timeout" $keypath.timeout }}
     {{- end }}
 
     {{- $_ := set $.Values "__rerun_interval" "infinite" }}
     {{- if hasKey $keypath "rerun_interval" }}
-      {{- fail (print "NOT IMPLEMENTED: 'rerun_interval' FOR '" $script "'") }}
+      {{- if not (eq ($keypath.rerun_interval | toString) "infinite") }}{{- /* "infinite" is the non-numeric default sentinel; only finite values get the numeric and policy checks below */}}
+        {{- if lt ($keypath.rerun_interval | int) $exec_loop_sleep_interval }}
+          {{- fail (print "BAD 'rerun_interval' FOR '" $script "': Got '" $keypath.rerun_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
+        {{- end }}
+        {{- if not (eq $.Values.__rerun_policy "always") }}
+          {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' when defining a finite 'rerun_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'rerun_interval' of '" $keypath.rerun_interval "' for '" $script "'.") }}
+        {{- end }}
+      {{- end }}
       {{- $_ := set $.Values "__rerun_interval" $keypath.rerun_interval }}
     {{- end }}
 
-    {{- $_ := set $.Values "__rerun_interval_persist" "false" }}
+    {{- $_ := set $.Values "__rerun_interval_persist" "true" }}
     {{- if hasKey $keypath "rerun_interval_persist" }}
       {{- fail (print "NOT IMPLEMENTED: 'rerun_interval_persist' FOR '" $script "'") }}
       {{- $_ := set $.Values "__rerun_interval_persist" $keypath.rerun_interval_persist }}
@@ -98,13 +109,20 @@ cd "${exec_path}"
       {{- $_ := set $.Values "__rerun_max_count" $keypath.rerun_max_count }}
     {{- end }}
 
-    {{- $_ := set $.Values "__retry_interval" $.Values.__rerun_interval }}
+    {{- $_ := set $.Values "__retry_interval" (print $.Values.__rerun_interval) }}
     {{- if hasKey $keypath "retry_interval" }}
-      {{- fail (print "NOT IMPLEMENTED: 'retry_interval' FOR '" $script "'") }}
+      {{- if not (eq ($keypath.retry_interval | toString) "infinite") }}{{- /* "infinite" is the non-numeric sentinel used by the rendered shell check; only finite values get the numeric and policy checks below */}}
+        {{- if lt ($keypath.retry_interval | int) $exec_loop_sleep_interval }}
+          {{- fail (print "BAD 'retry_interval' FOR '" $script "': Got '" $keypath.retry_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
+        {{- end }}
+        {{- if and (not (eq $.Values.__rerun_policy "always")) (not (eq $.Values.__rerun_policy "once_successfully")) }}
+          {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' or 'once_successfully' when defining a finite 'retry_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'retry_interval' of '" $keypath.retry_interval "' for '" $script "'.") }}
+        {{- end }}
+      {{- end }}
       {{- $_ := set $.Values "__retry_interval" $keypath.retry_interval }}
     {{- end }}
 
-    {{- $_ := set $.Values "__retry_interval_persist" "false" }}
+    {{- $_ := set $.Values "__retry_interval_persist" "true" }}
     {{- if hasKey $keypath "retry_interval_persist" }}
       {{- fail (print "NOT IMPLEMENTED: 'retry_interval_persist' FOR '" $script "'") }}
       {{- $_ := set $.Values "__retry_interval_persist" $keypath.retry_interval_persist }}
@@ -115,15 +133,43 @@ cd "${exec_path}"
       {{- fail (print "NOT IMPLEMENTED: 'retry_max_count' FOR '" $script "'") }}
       {{- $_ := set $.Values "__retry_max_count" $keypath.retry_max_count }}
     {{- end }}
+
     cat <<'UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526' > {{ $script }}
 {{ $keypath.data }}
 UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
     chmod 700 {{ $script }}
+    # check rerun policy
+    hash_check=fail
     if  [[ {{ $.Values.__rerun_policy }} = always ]] || \
         [[ ! -f ${hash}/exit_code ]] || \
        ([[ {{ $.Values.__rerun_policy }} = once_successfully ]] && \
-          [[ -f ${hash}/exit_code ]] && \
-          [[ $(cat ${hash}/exit_code) != 0 ]]); then
+        [[ $(cat ${hash}/exit_code) != 0 ]]); then
+      hash_check=pass
+    fi
+    # check rerun/retry interval
+    interval_check=fail
+    if  [[ ! -f ${hash}/last_run_timestamp ]] || [[ ! -f ${hash}/exit_code ]]; then
+      interval_check=pass
+    elif [[ $(cat ${hash}/exit_code) = 0 ]]; then
+      if [[ {{ $.Values.__rerun_interval }} = infinite ]]; then
+        interval_check=pass
+      elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__rerun_interval }})) ]]; then
+        interval_check=pass
+      fi
+    elif [[ $(cat ${hash}/exit_code) != 0 ]]; then
+      if [[ {{ $.Values.__retry_interval }} = infinite ]]; then
+        interval_check=pass
+      elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__retry_interval }})) ]]; then
+        interval_check=pass
+      fi
+    fi
+    if [[ $hash_check = pass ]] && [[ $interval_check = pass ]]; then
+      if [[ -f ${hash}/exit_code ]]; then
+        # remove previous run record, in case this run is interrupted
+        rm ${hash}/exit_code
+      fi
+      # write timestamp at beginning of execution
+      echo $(date +"%s") > "${hash}/last_run_timestamp"
       {{- if hasKey $keypath "env" }}
         {{- range $env_key, $env_val := $keypath.env }}
           {{ $env_key }}={{ $env_val | squote }} \
@@ -135,7 +181,26 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
           {{ $arg | squote }} \
         {{- end }}
       {{- end }}
-      && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
+      &
+      pid=$!
+      time_waited=0
+      sleep_interval=5
+      timeout={{ $.Values.__timeout }}
+      while true; do
+        if [[ $time_waited -ge $timeout ]]; then
+          log.ERROR "Hit '$timeout' second timeout waiting for '{{ $script }}' - terminating."
+          # ask nicely first
+          kill $pid
+          sleep 10
+          # force kill if still running
+          ps $pid > /dev/null && kill -9 $pid
+          break
+        fi
+        ps $pid > /dev/null || break
+        sleep $sleep_interval
+        time_waited=$(($time_waited + $sleep_interval))
+      done
+      wait $pid && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
       {{- if hasKey $keypath "blocking_policy" }}
         {{- if eq $keypath.blocking_policy "foreground_halt_pod_on_failure" }}
           if [[ $(cat "${hash}/exit_code") != '0' ]]; then
@@ -144,20 +209,16 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
         {{- end }}
       {{- end }}
     fi
-  {{ end }}
+  {{- end }}
 {{- end }}
 
-exit 0
 UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381
 
 chmod 700 {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
-chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
 
-sleep 1
-echo 'INFO Putting the daemon to sleep.'
-
-while [ 1 ]; do
-  sleep 300
+while true; do
+  chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
+  sleep 2
+  echo 'INFO Putting the daemon to sleep.'
+  sleep {{ $exec_loop_sleep_interval }}
 done
-
-exit 0
diff --git a/doc/source/index.rst b/doc/source/index.rst
index edc297f..ec74a98 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -209,6 +209,17 @@ The following set of options are fully implemeneted::
     If any of that info changes, so will the hash, and it will be seen as a new
     object which will be executed regardless of this setting.
 
+    ``timeout`` may optionally be set to the number of seconds to wait for
+    script completion before termination. Default value is ``1800`` (30 min).
+
+    ``rerun_interval`` may be optionally set to the number of seconds to wait
+    between rerunning a given script which ran successfully the previous time.
+    Default value is ``infinite``.
+
+    ``retry_interval`` may be optionally set to the number of seconds to wait
+    between rerunning a given script which did not run successfully the previous
+    time. Default behavior is to match the ``rerun_interval``.
+
 The following set of options are partially implemeneted::
 
     ``blocking_policy`` may optionally be set to ``background``, ``foreground``,
@@ -223,30 +234,17 @@ The following set of options are partially implemeneted::
 
 The following set of options are not yet implemeneted::
 
-    ``script_timeout`` may optionally be set to the number of seconds to wait for
-    script completion before termination. Default value is ``3600`` (1 hour).
-
-    ``rerun_interval`` may be optionally set to the number of seconds to wait
-    between rerunning a given script which ran successfully the previous time.
-    Default value is ``infinite``.
-
-    ``rerun_interval_persist`` may be optionally set to ``true`` for
-    a given script. This allows a script to persist its rerun interval through a
-    pod/node restart. Otherwise, the time since last successful script execution
-    will not be considered on pod/node startup. Default value is ``false``.
+    ``rerun_interval_persist`` may be optionally set to ``false`` for a given
+    script. This makes the script execute on pod/node startup regardless of the
+    interval since the last successful execution. Default value is ``true``.
 
     ``rerun_max_count`` may be optionally set to the maximum number of times a
     succeeding script should be retried. Successful exec count does not persist
     through pod/node restart. Default value is ``infinite``.
 
-    ``retry_interval`` may be optionally set to the number of seconds to wait
-    between rerunning a given script which did not run successfully the previous
-    time. Default value is set to the ``rerun_interval``.
-
-    ``retry_interval_persist`` may be optionally set to ``true`` for
-    a given script. This allows a script to persist its retry interval through a
-    pod/node restart. Otherwise, the time since last failed script execution
-    will not be considered on pod/node startup. Default value is ``false``.
+    ``retry_interval_persist`` may be optionally set to ``false`` for a given
+    script. This makes the script execute on pod/node startup, regardless of the
+    time since the last execution. Default value is ``true``.
 
     ``retry_max_count`` may be optionally set to the maximum number of times a
     failing script should be retried. Failed exec count does not persist
diff --git a/tools/gate/playbooks/osh-infra-upgrade-host.yaml b/tools/gate/playbooks/osh-infra-upgrade-host.yaml
index 495b5cb..3a2b79b 100644
--- a/tools/gate/playbooks/osh-infra-upgrade-host.yaml
+++ b/tools/gate/playbooks/osh-infra-upgrade-host.yaml
@@ -39,3 +39,15 @@
     - upgrade-host
     - start-zuul-console
     - disable-local-nameserver
+
+- hosts: all
+  vars_files:
+    - vars.yaml
+  vars:
+    work_dir: "{{ zuul.project.src_dir }}/{{ zuul_osh_infra_relative_path | default('') }}"
+  gather_facts: False
+  become: yes
+  roles:
+    - deploy-apparmor
+  tags:
+    - deploy-apparmor
diff --git a/tools/gate/scripts/020-test-divingbell.sh b/tools/gate/scripts/020-test-divingbell.sh
index c408c66..a65e508 100755
--- a/tools/gate/scripts/020-test-divingbell.sh
+++ b/tools/gate/scripts/020-test-divingbell.sh
@@ -1178,6 +1178,123 @@ manifests:
     echo "[SUCCESS] exec test$(($i + 5)) passed successfully" >> "${TEST_RESULTS}"
   done
 
+  # test timeout
+  local overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set17.yaml
+  echo 'conf:
+  exec:
+    011-timeout.sh:
+      timeout: 11
+      data: |
+        #!/bin/bash
+        sleep 60' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}"
+  get_container_status exec
+  _test_clog_msg 'timeout waiting for'
+  echo '[SUCCESS] exec test17 passed successfully' >> "${TEST_RESULTS}"
+
+  # Test invalid timeout
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set18.yaml
+  echo 'conf:
+  exec:
+    011-timeout.sh:
+      timeout: infinite
+      data: |
+        #!/bin/bash
+        sleep 60' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .timeout. FOR' || \
+    (echo "[FAIL] exec test18 did not receive expected 'BAD .timeout. FOR' error" && exit 1)
+  echo '[SUCCESS] exec test18 passed successfully' >> "${TEST_RESULTS}"
+
+  # Test invalid rerun_interval (too short)
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set19.yaml
+  echo 'conf:
+  exec:
+    012-rerun-interval.sh:
+      rerun_interval: 30
+      data: |
+        #!/bin/bash
+        true' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .rerun_interval. FOR' || \
+    (echo "[FAIL] exec test19 did not receive expected 'BAD .rerun_interval. FOR' error" && exit 1)
+  echo '[SUCCESS] exec test19 passed successfully' >> "${TEST_RESULTS}"
+
+  # Test invalid retry_interval (too short)
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set20.yaml
+  echo 'conf:
+  exec:
+    012-retry-interval.sh:
+      retry_interval: 30
+      data: |
+        #!/bin/bash
+        true' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .retry_interval. FOR' || \
+    (echo "[FAIL] exec test20 did not receive expected 'BAD .retry_interval. FOR' error" && exit 1)
+  echo '[SUCCESS] exec test20 passed successfully' >> "${TEST_RESULTS}"
+
+  # Test invalid rerun_interval combination
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set21.yaml
+  echo 'conf:
+  exec:
+    012-rerun-interval.sh:
+      rerun_interval: 60
+      rerun_policy: once_successfully
+      data: |
+        #!/bin/bash
+        true' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
+    (echo "[FAIL] exec test21 did not receive expected 'BAD COMBINATION' error" && exit 1)
+  echo '[SUCCESS] exec test21 passed successfully' >> "${TEST_RESULTS}"
+
+  # Test invalid retry_interval combination
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set22.yaml
+  echo 'conf:
+  exec:
+    012-retry-interval.sh:
+      retry_interval: 60
+      rerun_policy: never
+      data: |
+        #!/bin/bash
+        true' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
+    (echo "[FAIL] exec test22 did not receive expected 'BAD COMBINATION' error" && exit 1)
+  echo '[SUCCESS] exec test22 passed successfully' >> "${TEST_RESULTS}"
+
+  # test rerun_interval
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set23.yaml
+  echo 'conf:
+  exec:
+    012-rerun-interval.sh:
+      rerun_interval: 60
+      data: |
+        #!/bin/bash
+        echo script name: ${BASH_SOURCE} >> exec_testfile' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}"
+  get_container_status exec
+  sleep 72
+  get_container_status exec
+  expected_result='script name: ./012-rerun-interval.sh
+script name: ./012-rerun-interval.sh'
+  _test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test23"
+  echo '[SUCCESS] exec test23 passed successfully' >> "${TEST_RESULTS}"
+
+  # test retry_interval
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set24.yaml
+  echo 'conf:
+  exec:
+    012-retry-interval.sh:
+      retry_interval: 60
+      data: |
+        #!/bin/bash
+        echo script name: ${BASH_SOURCE} >> exec_testfile
+        false' > "${overrides_yaml}"
+  install_base "--values=${overrides_yaml}"
+  get_container_status exec
+  sleep 72
+  get_container_status exec
+  expected_result='script name: ./012-retry-interval.sh
+script name: ./012-retry-interval.sh'
+  _test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test24"
+  echo '[SUCCESS] exec test24 passed successfully' >> "${TEST_RESULTS}"
 }
 
 # test daemonset value overrides for hosts and labels