From 4ed467e512ac1d95f577e44324ed04d68b07c183 Mon Sep 17 00:00:00 2001 From: Craig Anderson Date: Tue, 11 Dec 2018 06:53:54 +0000 Subject: [PATCH] Add retry/rerun support for exec module Add support for retries and reruns at specified intervals for divingbell-exec scripts. Also adds support for timeouts. Also update osh-infra-upgrade-host to allow gate to run. Change-Id: I5f4cd43b13a467d94f67b358f3190f515256ae66 --- divingbell/templates/bin/_exec.sh.tpl | 101 ++++++++++++--- doc/source/index.rst | 36 +++--- .../playbooks/osh-infra-upgrade-host.yaml | 12 ++ tools/gate/scripts/020-test-divingbell.sh | 117 ++++++++++++++++++ 4 files changed, 227 insertions(+), 39 deletions(-) diff --git a/divingbell/templates/bin/_exec.sh.tpl b/divingbell/templates/bin/_exec.sh.tpl index a042d55..41924f0 100644 --- a/divingbell/templates/bin/_exec.sh.tpl +++ b/divingbell/templates/bin/_exec.sh.tpl @@ -16,6 +16,8 @@ # limitations under the License. */}} +{{- $exec_loop_sleep_interval := 60 }} + set -e cat <<'UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381' > {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh @@ -74,19 +76,28 @@ cd "${exec_path}" {{- $_ := set $.Values "__blocking_policy" $keypath.blocking_policy }} {{- end }} - {{- $_ := set $.Values "__timeout" 3600 }} + {{- $_ := set $.Values "__timeout" 1800 }} {{- if hasKey $keypath "timeout" }} - {{- fail (print "NOT IMPLEMENTED: 'timeout' FOR '" $script "'") }} + {{- if eq ($keypath.timeout | toString) "infinite" }} + {{- fail (print "BAD 'timeout' FOR '" $script "': 'infinite' timeouts not supported.") }} + {{- end }} {{- $_ := set $.Values "__timeout" $keypath.timeout }} {{- end }} {{- $_ := set $.Values "__rerun_interval" "infinite" }} {{- if hasKey $keypath "rerun_interval" }} - {{- fail (print "NOT IMPLEMENTED: 'rerun_interval' FOR '" $script "'") }} + {{- if not (eq ($keypath.rerun_interval | toString) "infinity") }} + {{- if lt ($keypath.rerun_interval | int) $exec_loop_sleep_interval }} + {{- fail (print "BAD 'rerun_interval' FOR '" $script "': Got '" $keypath.rerun_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }} + {{- end }} + {{- if not (eq $.Values.__rerun_policy "always") }} + {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' when defining a finite 'rerun_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'rerun_interval' of '" $keypath.rerun_interval "' for '" $script "'.") }} + {{- end }} + {{- end }} {{- $_ := set $.Values "__rerun_interval" $keypath.rerun_interval }} {{- end }} - {{- $_ := set $.Values "__rerun_interval_persist" "false" }} + {{- $_ := set $.Values "__rerun_interval_persist" "true" }} {{- if hasKey $keypath "rerun_interval_persist" }} {{- fail (print "NOT IMPLEMENTED: 'rerun_interval_persist' FOR '" $script "'") }} {{- $_ := set $.Values "__rerun_interval_persist" $keypath.rerun_interval_persist }} @@ -98,13 +109,20 @@ cd "${exec_path}" {{- $_ := set $.Values "__rerun_max_count" $keypath.rerun_max_count }} {{- end }} - {{- $_ := set $.Values "__retry_interval" $.Values.__rerun_interval }} + {{- $_ := set $.Values "__retry_interval" (print $.Values.__rerun_interval) }} {{- if hasKey $keypath "retry_interval" }} - {{- fail (print "NOT IMPLEMENTED: 'retry_interval' FOR '" $script "'") }} + {{- if not (eq ($keypath.retry_interval | toString) "infinity") }} + {{- if lt ($keypath.retry_interval | int) $exec_loop_sleep_interval }} + {{- fail (print "BAD 'retry_interval' FOR '" $script "': Got '" $keypath.retry_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }} + {{- end }} + {{- if and (not (eq $.Values.__rerun_policy "always")) (not (eq $.Values.__rerun_policy "once_successfully")) }} + {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' or 'once_successfully' when defining a finite 'retry_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'retry_interval' of '" $keypath.retry_interval "' for '" $script "'.") }} + {{- end }} + {{- end }} {{- $_ := set $.Values "__retry_interval" $keypath.retry_interval }} {{- end }} - {{- $_ := set $.Values "__retry_interval_persist" "false" }} + {{- $_ := set $.Values "__retry_interval_persist" "true" }} {{- if hasKey $keypath "retry_interval_persist" }} {{- fail (print "NOT IMPLEMENTED: 'retry_interval_persist' FOR '" $script "'") }} {{- $_ := set $.Values "__retry_interval_persist" $keypath.retry_interval_persist }} @@ -115,15 +133,43 @@ cd "${exec_path}" {{- fail (print "NOT IMPLEMENTED: 'retry_max_count' FOR '" $script "'") }} {{- $_ := set $.Values "__retry_max_count" $keypath.retry_max_count }} {{- end }} + cat <<'UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526' > {{ $script }} {{ $keypath.data }} UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526 chmod 700 {{ $script }} + # check rerun policy + hash_check=fail if [[ {{ $.Values.__rerun_policy }} = always ]] || \ [[ ! -f ${hash}/exit_code ]] || \ ([[ {{ $.Values.__rerun_policy }} = once_successfully ]] && \ - [[ -f ${hash}/exit_code ]] && \ - [[ $(cat ${hash}/exit_code) != 0 ]]); then + [[ $(cat ${hash}/exit_code) != 0 ]]); then + hash_check=pass + fi + # check rerun/retry interval + interval_check=fail + if [[ ! -f ${hash}/last_run_timestamp ]] || [[ ! -f ${hash}/exit_code ]]; then + interval_check=pass + elif [[ $(cat ${hash}/exit_code) = 0 ]]; then + if [[ {{ $.Values.__rerun_interval }} = infinite ]]; then + interval_check=pass + elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__rerun_interval }})) ]]; then + interval_check=pass + fi + elif [[ $(cat ${hash}/exit_code) != 0 ]]; then + if [[ {{ $.Values.__retry_interval }} = infinite ]]; then + interval_check=pass + elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__retry_interval }})) ]]; then + interval_check=pass + fi + fi + if [[ $hash_check = pass ]] && [[ $interval_check = pass ]]; then + if [[ -f ${hash}/exit_code ]]; then + # remove previous run record, in case this run is interrupted + rm ${hash}/exit_code + fi + # write timestamp at beginning of execution + echo $(date +"%s") > "${hash}/last_run_timestamp" {{- if hasKey $keypath "env" }} {{- range $env_key, $env_val := $keypath.env }} {{ $env_key }}={{ $env_val | squote }} \ @@ -135,7 +181,26 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526 {{ $arg | squote }} \ {{- end }} {{- end }} - && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code" + & + pid=$! + time_waited=0 + sleep_interval=5 + timeout={{ $.Values.__timeout }} + while true; do + if [[ $time_waited -ge $timeout ]]; then + log.ERROR "Hit '$timeout' second timeout waiting for '{{ $script }}' - terminating." + # ask nicely first + kill $pid + sleep 10 + # force kill if still running + ps $pid > /dev/null && kill -9 $pid + break + fi + ps $pid > /dev/null || break + sleep $sleep_interval + time_waited=$(($time_waited + $sleep_interval)) + done + wait $pid && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code" {{- if hasKey $keypath "blocking_policy" }} {{- if eq $keypath.blocking_policy "foreground_halt_pod_on_failure" }} if [[ $(cat "${hash}/exit_code") != '0' ]]; then @@ -144,20 +209,16 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526 {{- end }} {{- end }} fi - {{ end }} + {{- end }} {{- end }} -exit 0 UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381 chmod 700 {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh -chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh -sleep 1 -echo 'INFO Putting the daemon to sleep.' - -while [ 1 ]; do - sleep 300 +while true; do + chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh + sleep 2 + echo 'INFO Putting the daemon to sleep.' + sleep {{ $exec_loop_sleep_interval }} done - -exit 0 diff --git a/doc/source/index.rst b/doc/source/index.rst index edc297f..ec74a98 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -209,6 +209,17 @@ The following set of options are fully implemeneted:: If any of that info changes, so will the hash, and it will be seen as a new object which will be executed regardless of this setting. + ``script_timeout`` may optionally be set to the number of seconds to wait for + script completion before termination. Default value is ``1800`` (30 min). + + ``rerun_interval`` may be optionally set to the number of seconds to wait + between rerunning a given script which ran successfully the previous time. + Default value is ``infinite``. + + ``retry_interval`` may be optionally set to the number of seconds to wait + between rerunning a given script which did not run successfully the previous + time. Default behavior is to match the ``rerun_interval``. + The following set of options are partially implemeneted:: ``blocking_policy`` may optionally be set to ``background``, ``foreground``, @@ -223,30 +234,17 @@ The following set of options are partially implemeneted:: The following set of options are not yet implemeneted:: - ``script_timeout`` may optionally be set to the number of seconds to wait for - script completion before termination. Default value is ``3600`` (1 hour). - - ``rerun_interval`` may be optionally set to the number of seconds to wait - between rerunning a given script which ran successfully the previous time. - Default value is ``infinite``. - - ``rerun_interval_persist`` may be optionally set to ``true`` for - a given script. This allows a script to persist its rerun interval through a - pod/node restart. Otherwise, the time since last successful script execution - will not be considered on pod/node startup. Default value is ``false``. + ``rerun_interval_persist`` may be optionally set to ``false`` for a given + script. This makes the script execute on pod/node startup regardless of the + interval since the last successful execution. Default value is ``true``. ``rerun_max_count`` may be optionally set to the maximum number of times a succeeding script should be retried. Successful exec count does not persist through pod/node restart. Default value is ``infinite``. - ``retry_interval`` may be optionally set to the number of seconds to wait - between rerunning a given script which did not run successfully the previous - time. Default value is set to the ``rerun_interval``. - - ``retry_interval_persist`` may be optionally set to ``true`` for - a given script. This allows a script to persist its retry interval through a - pod/node restart. Otherwise, the time since last failed script execution - will not be considered on pod/node startup. Default value is ``false``. + ``retry_interval_persist`` may be optionally set to ``false`` for a given + script. This makes the script execute on pod/node startup, regardless of the + time since the last execution. Default value is ``true``. ``retry_max_count`` may be optionally set to the maximum number of times a failing script should be retried. Failed exec count does not persist diff --git a/tools/gate/playbooks/osh-infra-upgrade-host.yaml b/tools/gate/playbooks/osh-infra-upgrade-host.yaml index 495b5cb..3a2b79b 100644 --- a/tools/gate/playbooks/osh-infra-upgrade-host.yaml +++ b/tools/gate/playbooks/osh-infra-upgrade-host.yaml @@ -39,3 +39,15 @@ - upgrade-host - start-zuul-console - disable-local-nameserver + +- hosts: all + vars_files: + - vars.yaml + vars: + work_dir: "{{ zuul.project.src_dir }}/{{ zuul_osh_infra_relative_path | default('') }}" + gather_facts: False + become: yes + roles: + - deploy-apparmor + tags: + - deploy-apparmor diff --git a/tools/gate/scripts/020-test-divingbell.sh b/tools/gate/scripts/020-test-divingbell.sh index c408c66..a65e508 100755 --- a/tools/gate/scripts/020-test-divingbell.sh +++ b/tools/gate/scripts/020-test-divingbell.sh @@ -1178,6 +1178,123 @@ manifests: echo "[SUCCESS] exec test$(($i + 5)) passed successfully" >> "${TEST_RESULTS}" done + # test timeout + local overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set17.yaml + echo 'conf: + exec: + 011-timeout.sh: + timeout: 11 + data: | + #!/bin/bash + sleep 60' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" + get_container_status exec + _test_clog_msg 'timeout waiting for' + echo '[SUCCESS] exec test17 passed successfully' >> "${TEST_RESULTS}" + + # Test invalid timeout + overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set18.yaml + echo 'conf: + exec: + 011-timeout.sh: + timeout: infinite + data: | + #!/bin/bash + sleep 60' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .timeout. FOR' || \ + (echo "[FAIL] exec test18 did not receive expected 'BAD .timeout. FOR' error" && exit 1) + echo '[SUCCESS] exec test18 passed successfully' >> "${TEST_RESULTS}" + + # Test invalid rerun_interval (too short) + overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set19.yaml + echo 'conf: + exec: + 012-rerun-interval.sh: + rerun_interval: 30 + data: | + #!/bin/bash + true' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .rerun_interval. FOR' || \ + (echo "[FAIL] exec test19 did not receive expected 'BAD .rerun_interval. FOR' error" && exit 1) + echo '[SUCCESS] exec test19 passed successfully' >> "${TEST_RESULTS}" + + # Test invalid retry_interval (too short) + overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set20.yaml + echo 'conf: + exec: + 012-retry-interval.sh: + retry_interval: 30 + data: | + #!/bin/bash + true' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .retry_interval. FOR' || \ + (echo "[FAIL] exec test20 did not receive expected 'BAD .retry_interval. FOR' error" && exit 1) + echo '[SUCCESS] exec test20 passed successfully' >> "${TEST_RESULTS}" + + # Test invalid rerun_interval combination + overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set21.yaml + echo 'conf: + exec: + 012-rerun-interval.sh: + rerun_interval: 60 + rerun_policy: once_successfully + data: | + #!/bin/bash + true' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \ + (echo "[FAIL] exec test21 did not receive expected 'BAD COMBINATION' error" && exit 1) + echo '[SUCCESS] exec test21 passed successfully' >> "${TEST_RESULTS}" + + # Test invalid retry_interval combination + overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set22.yaml + echo 'conf: + exec: + 012-retry-interval.sh: + retry_interval: 60 + rerun_policy: never + data: | + #!/bin/bash + true' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \ + (echo "[FAIL] exec test22 did not receive expected 'BAD COMBINATION' error" && exit 1) + echo '[SUCCESS] exec test22 passed successfully' >> "${TEST_RESULTS}" + + # test rerun_interval + overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set23.yaml + echo 'conf: + exec: + 012-rerun-interval.sh: + rerun_interval: 60 + data: | + #!/bin/bash + echo script name: ${BASH_SOURCE} >> exec_testfile' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" + get_container_status exec + sleep 72 + get_container_status exec + expected_result='script name: ./012-rerun-interval.sh +script name: ./012-rerun-interval.sh' + _test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test23" + echo '[SUCCESS] exec test23 passed successfully' >> "${TEST_RESULTS}" + + # test retry_interval + overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set24.yaml + echo 'conf: + exec: + 012-retry-interval.sh: + retry_interval: 60 + data: | + #!/bin/bash + echo script name: ${BASH_SOURCE} >> exec_testfile + false' > "${overrides_yaml}" + install_base "--values=${overrides_yaml}" + get_container_status exec + sleep 72 + get_container_status exec + expected_result='script name: ./012-retry-interval.sh +script name: ./012-retry-interval.sh' + _test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test24" + echo '[SUCCESS] exec test24 passed successfully' >> "${TEST_RESULTS}" } # test daemonset value overrides for hosts and labels