From 03ebbaaca3f38ab49a5993b506761a4abfd19d89 Mon Sep 17 00:00:00 2001 From: Phil Sphicas Date: Thu, 28 Nov 2019 23:03:26 -0800 Subject: [PATCH] maas-rack image: improve IPMI error handling MAAS uses ipmipower to power nodes on and off. The ipmipower command sometimes fails with the message: x.x.x.x: BMC error but exits with a return code of 0. Because MAAS is not aware of this specific error, and is also not able to detect the power state from the output, the built-in retry logic is not used, resulting in failed deployments or nodes in Power Error. This patch adds "BMC error" to the list of known retriable errors, and also adds additional retries. Error strings from ipmipower 1.4: http://git.savannah.gnu.org/cgit/freeipmi.git/tree/ipmipower/ipmipower_output.c?h=Release-1_4_0_branch#n52 IPMI_ERRORS known to MAAS 2.3: https://git.launchpad.net/maas/tree/src/provisioningserver/drivers/power/ipmi.py?h=2.3#n50 Change-Id: Ia4b10e47855d67ba81e9ab800be3d780f8b38cac --- .../maas-rack-controller/2.3_ipmi_error.patch | 26 +++++++++++++++++++ images/maas-rack-controller/Dockerfile | 3 +++ 2 files changed, 29 insertions(+) create mode 100644 images/maas-rack-controller/2.3_ipmi_error.patch diff --git a/images/maas-rack-controller/2.3_ipmi_error.patch b/images/maas-rack-controller/2.3_ipmi_error.patch new file mode 100644 index 0000000..703fa5f --- /dev/null +++ b/images/maas-rack-controller/2.3_ipmi_error.patch @@ -0,0 +1,26 @@ +diff --git a/src/provisioningserver/drivers/power/ipmi.py b/src/provisioningserver/drivers/power/ipmi.py +index 219ee268b..acdf2065d 100644 +--- a/src/provisioningserver/drivers/power/ipmi.py ++++ b/src/provisioningserver/drivers/power/ipmi.py +@@ -143,6 +143,12 @@ IPMI_ERRORS = { + " MAAS performed several retries. Please wait and try again."), + 'exception': PowerConnError + }, ++ 'BMC error': { ++ 'message': ( ++ "Device not responding correctly while performing power action." ++ " MAAS performed several retries. 
Please wait and try again."), ++ 'exception': PowerConnError ++ }, + 'could not find inband device': { + 'message': ( + "An inband device could not be found." +@@ -209,7 +215,7 @@ class IPMIPowerDriver(PowerDriver): + 'mac_address', "Power MAC", scope=SETTING_SCOPE.NODE) + ] + ip_extractor = make_ip_extractor('power_address') +- wait_time = (4, 8, 16, 32) ++ wait_time = (4, 4, 8, 8, 16, 16, 32, 32) + + def detect_missing_packages(self): + if not shell.has_command_available('ipmipower'): diff --git a/images/maas-rack-controller/Dockerfile b/images/maas-rack-controller/Dockerfile index 7dca08b..16d14d0 100644 --- a/images/maas-rack-controller/Dockerfile +++ b/images/maas-rack-controller/Dockerfile @@ -56,11 +56,14 @@ COPY 2.3_mac_address.patch /tmp/2.3_mac_address.patch # sh8121att: patch so query for RPC info contains proper Host header copy 2.3_hostheader.patch /tmp/2.3_hostheader.patch COPY 2.3_secure_headers.patch /tmp/2.3_secure_headers.patch +# Patch so maas knows that "BMC error" is retriable +COPY 2.3_ipmi_error.patch /tmp/2.3_ipmi_error.patch RUN cd /usr/lib/python3/dist-packages/provisioningserver/utils && patch network.py < /tmp/2.3_nic_filter.patch RUN cd /usr/lib/python3/dist-packages/provisioningserver/utils && patch ipaddr.py < /tmp/2.3_mac_address.patch RUN cd /usr/lib/python3/dist-packages/provisioningserver/rpc && patch clusterservice.py < /tmp/2.3_hostheader.patch RUN cd /usr/lib/python3/dist-packages/twisted/web && patch server.py < /tmp/2.3_secure_headers.patch +RUN cd /usr/lib/python3/dist-packages/provisioningserver/drivers/power && patch ipmi.py < /tmp/2.3_ipmi_error.patch # echo journalctl logs to the container's stdout COPY scripts/journalctl-to-tty.service /etc/systemd/system/journalctl-to-tty.service