drydock/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py

446 lines
17 KiB
Python

# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import logging
from oslo_config import cfg
from pyghmi.ipmi.command import Command
from pyghmi.exceptions import IpmiException
import drydock_provisioner.error as errors
import drydock_provisioner.objects.fields as hd_fields
import drydock_provisioner.objects.task as task_model
import drydock_provisioner.drivers.oob as oob
import drydock_provisioner.drivers as drivers
class PyghmiDriver(oob.OobDriver):
    """Out-of-band driver that acts on nodes over IPMI through pyghmi.

    For every targeted node the driver creates a subtask, starts a
    ``PyghmiTaskRunner`` for it and then polls the state manager until all
    subtasks have finished or the polling budget is exhausted.
    """

    # Options registered under the configuration group named by driver_key.
    pyghmi_driver_options = [
        cfg.IntOpt(
            'poll_interval',
            default=10,
            help='Polling interval in seconds for querying IPMI status'),
    ]

    oob_types_supported = ['ipmi']

    driver_name = "pyghmi_driver"
    driver_key = "pyghmi_driver"
    driver_desc = "Pyghmi OOB Driver"

    def __init__(self, **kwargs):
        """Register the driver options and set up the driver logger.

        :param kwargs: Passed through unchanged to the parent driver class
        """
        super(PyghmiDriver, self).__init__(**kwargs)

        cfg.CONF.register_opts(
            PyghmiDriver.pyghmi_driver_options, group=PyghmiDriver.driver_key)

        self.logger = logging.getLogger(cfg.CONF.logging.oobdriver_logger_name)

    def execute_task(self, task_id):
        """Run the task identified by ``task_id`` against its target nodes.

        A ``ValidateOobServices`` task is marked successful immediately.
        For any other action one subtask and one ``PyghmiTaskRunner`` are
        created per target node, and the parent task result is derived
        from the subtask results once polling ends.

        :param task_id: ID of the task to load from the state manager
        :raises errors.DriverError: If the task does not exist, its action
            is not in ``self.supported_actions`` or it has no design ID
        """
        task = self.state_manager.get_task(task_id)

        if task is None:
            self.logger.error("Invalid task %s" % (task_id))
            raise errors.DriverError("Invalid task %s" % (task_id))

        if task.action not in self.supported_actions:
            self.logger.error("Driver %s doesn't support task action %s" %
                              (self.driver_desc, task.action))
            raise errors.DriverError(
                "Driver %s doesn't support task action %s" % (self.driver_desc,
                                                              task.action))

        design_id = getattr(task, 'design_id', None)

        if design_id is None:
            raise errors.DriverError("No design ID specified in task %s" %
                                     (task_id))

        self.orchestrator.task_field_update(
            task.get_id(), status=hd_fields.TaskStatus.Running)

        if task.action == hd_fields.OrchestratorAction.ValidateOobServices:
            self.orchestrator.task_field_update(
                task.get_id(),
                status=hd_fields.TaskStatus.Complete,
                result=hd_fields.ActionResult.Success)
            return

        site_design = self.orchestrator.get_effective_site(design_id)

        # An empty node_list means "every baremetal node in the design".
        if len(task.node_list) > 0:
            target_nodes = [
                x for x in site_design.baremetal_nodes
                if x.get_name() in task.node_list
            ]
        else:
            target_nodes = list(site_design.baremetal_nodes)

        incomplete_subtasks = []
        # For each target node, create a subtask and kick off a runner
        for n in target_nodes:
            subtask = self.orchestrator.create_task(
                task_model.DriverTask,
                parent_task_id=task.get_id(),
                design_id=design_id,
                action=task.action,
                task_scope={'node_names': [n.get_name()]})
            incomplete_subtasks.append(subtask.get_id())

            runner = PyghmiTaskRunner(
                state_manager=self.state_manager,
                orchestrator=self.orchestrator,
                task_id=subtask.get_id(),
                node=n)
            runner.start()

        poll_interval = cfg.CONF.pyghmi_driver.poll_interval
        # Number of polling rounds allowed for this action.
        # NOTE(review): the "* 60 / poll_interval" conversion suggests the
        # configured timeouts are expressed in minutes - confirm.
        max_attempts = getattr(cfg.CONF.timeouts, task.action,
                               cfg.CONF.timeouts.drydock_timeout) * (
                                   60 / poll_interval)
        finished_states = [
            hd_fields.TaskStatus.Terminated,
            hd_fields.TaskStatus.Complete,
            hd_fields.TaskStatus.Errored,
        ]

        attempts = 0
        while len(incomplete_subtasks) > 0 and attempts <= max_attempts:
            # Build a new list instead of calling remove() on the list being
            # iterated; removing in place skips the entry that follows each
            # removed one.
            incomplete_subtasks = [
                subtask_id for subtask_id in incomplete_subtasks
                if self.state_manager.get_task(subtask_id).get_status()
                not in finished_states
            ]
            time.sleep(poll_interval)
            attempts = attempts + 1

        task = self.state_manager.get_task(task.get_id())
        # Materialise the subtasks: map() yields a one-shot iterator on
        # Python 3, which would leave nothing for the second filter below.
        subtasks = [
            self.state_manager.get_task(subtask_id)
            for subtask_id in task.get_subtasks()
        ]

        success_subtasks = [
            x for x in subtasks
            if x.get_result() == hd_fields.ActionResult.Success
        ]
        nosuccess_subtasks = [
            x for x in subtasks
            if x.get_result() in [
                hd_fields.ActionResult.PartialSuccess,
                hd_fields.ActionResult.Failure
            ]
        ]

        if len(success_subtasks) > 0 and len(nosuccess_subtasks) > 0:
            task_result = hd_fields.ActionResult.PartialSuccess
        elif len(success_subtasks) == 0 and len(nosuccess_subtasks) > 0:
            task_result = hd_fields.ActionResult.Failure
        elif len(success_subtasks) > 0 and len(nosuccess_subtasks) == 0:
            task_result = hd_fields.ActionResult.Success
        else:
            # No subtask reported Success, PartialSuccess or Failure.
            task_result = hd_fields.ActionResult.Incomplete

        self.orchestrator.task_field_update(
            task.get_id(),
            result=task_result,
            status=hd_fields.TaskStatus.Complete)
        return
class PyghmiTaskRunner(drivers.DriverTaskRunner):
    """Runner that executes one OOB task against a single node over IPMI."""

    # Power state is polled up to 18 times with 10 seconds between polls.
    _POWER_POLL_ATTEMPTS = 18
    _POWER_POLL_INTERVAL = 10

    def __init__(self, node=None, **kwargs):
        """Create a runner bound to ``node``.

        :param node: The node model this runner operates on
        :param kwargs: Passed through unchanged to the parent runner class
        :raises errors.DriverError: If no node is given
        """
        super(PyghmiTaskRunner, self).__init__(**kwargs)

        self.logger = logging.getLogger('drydock.oobdriver.pyghmi')
        # We cheat here by providing the Node model instead
        # of making the runner source it from statemgmt
        if node is None:
            self.logger.error("Did not specify target node")
            raise errors.DriverError("Did not specify target node")

        self.node = node

    def _complete_task(self, result, **fields):
        """Mark this runner's task Complete with the given result.

        :param result: The ActionResult value to record on the task
        :param fields: Extra fields forwarded to ``task_field_update``
        """
        self.orchestrator.task_field_update(
            self.task.get_id(),
            result=result,
            status=hd_fields.TaskStatus.Complete,
            **fields)

    @staticmethod
    def _reported_powerstate(power_state):
        """Return the 'powerstate' entry of an IPMI response.

        :param power_state: Response of a power query, or None when the
            query produced no response
        :return: The reported power state, or '' when it is not available
        """
        if power_state is None:
            return ''
        return power_state.get('powerstate', '')

    def execute_task(self):
        """Perform the action of this runner's task on the bound node.

        The task is moved to Running, the IPMI commands for the task action
        are issued and the task is then marked Complete with a Success or
        Failure result. ``ConfigNodePxe`` is always reported as Failure.

        :raises errors.DriverError: If the task's node_list does not hold
            exactly one name, or that name is not the bound node's name
        """
        task_action = self.task.action

        if len(self.task.node_list) != 1:
            self.orchestrator.task_field_update(
                self.task.get_id(),
                result=hd_fields.ActionResult.Incomplete,
                status=hd_fields.TaskStatus.Errored)
            raise errors.DriverError(
                "Multiple names (%s) in task %s node_list" %
                (len(self.task.node_list), self.task.get_id()))

        target_node_name = self.task.node_list[0]

        if self.node.get_name() != target_node_name:
            self.orchestrator.task_field_update(
                self.task.get_id(),
                result=hd_fields.ActionResult.Incomplete,
                status=hd_fields.TaskStatus.Errored)
            raise errors.DriverError("Runner node does not match "
                                     "task node scope")

        self.orchestrator.task_field_update(
            self.task.get_id(), status=hd_fields.TaskStatus.Running)

        if task_action == hd_fields.OrchestratorAction.ConfigNodePxe:
            self._complete_task(hd_fields.ActionResult.Failure)
            return
        elif task_action == hd_fields.OrchestratorAction.SetNodeBoot:
            self.logger.debug("Setting bootdev to PXE for %s" % self.node.name)
            self.exec_ipmi_command(Command.set_bootdev, 'pxe')

            time.sleep(3)

            bootdev = self.exec_ipmi_command(Command.get_bootdev)
            # exec_ipmi_command returns None when every attempt failed.
            reported_bootdev = None
            if bootdev is not None:
                reported_bootdev = bootdev.get('bootdev', None)

            if reported_bootdev == 'network':
                self.logger.debug(
                    "%s reports bootdev of network" % self.node.name)
                self._complete_task(hd_fields.ActionResult.Success)
                return

            self.logger.warning("%s reports bootdev of %s" %
                                (self.node.name, reported_bootdev))
            self.logger.error(
                "Giving up on IPMI command to %s after 3 attempts" %
                self.node.name)
            self._complete_task(hd_fields.ActionResult.Failure)
            return
        elif task_action == hd_fields.OrchestratorAction.PowerOffNode:
            self.logger.debug(
                "Sending set_power = off command to %s" % self.node.name)
            self.exec_ipmi_command(Command.set_power, 'off')

            worked = False
            for _ in range(self._POWER_POLL_ATTEMPTS):
                self.logger.debug("Polling powerstate waiting for success.")
                power_state = self.exec_ipmi_command(Command.get_power)
                if self._reported_powerstate(power_state) == 'off':
                    self.logger.debug("Node reports powerstate of off")
                    worked = True
                    break
                time.sleep(self._POWER_POLL_INTERVAL)

            if worked:
                self._complete_task(hd_fields.ActionResult.Success)
            else:
                self.logger.error(
                    "Giving up on IPMI command to %s" % self.node.name)
                self._complete_task(hd_fields.ActionResult.Failure)
            return
        elif task_action == hd_fields.OrchestratorAction.PowerOnNode:
            # Power the node ON and wait for it to report 'on'; the previous
            # code sent 'off' and waited for 'off' here.
            self.logger.debug(
                "Sending set_power = on command to %s" % self.node.name)
            self.exec_ipmi_command(Command.set_power, 'on')

            worked = False
            for _ in range(self._POWER_POLL_ATTEMPTS):
                self.logger.debug("Polling powerstate waiting for success.")
                power_state = self.exec_ipmi_command(Command.get_power)
                if self._reported_powerstate(power_state) == 'on':
                    self.logger.debug("Node reports powerstate of on")
                    worked = True
                    break
                time.sleep(self._POWER_POLL_INTERVAL)

            if worked:
                self._complete_task(hd_fields.ActionResult.Success)
            else:
                self.logger.error(
                    "Giving up on IPMI command to %s" % self.node.name)
                self._complete_task(hd_fields.ActionResult.Failure)
            return
        elif task_action == hd_fields.OrchestratorAction.PowerCycleNode:
            self.logger.debug(
                "Sending set_power = off command to %s" % self.node.name)
            self.exec_ipmi_command(Command.set_power, 'off')

            # Wait for power state of off before booting back up
            # We'll wait for up to 3 minutes to power off
            power_state = None
            for _ in range(self._POWER_POLL_ATTEMPTS):
                power_state = self.exec_ipmi_command(Command.get_power)
                if self._reported_powerstate(power_state) == 'off':
                    self.logger.debug(
                        "%s reports powerstate of off" % self.node.name)
                    break
                elif power_state is None:
                    self.logger.debug("None response on IPMI power query to %s"
                                      % self.node.name)
                time.sleep(self._POWER_POLL_INTERVAL)

            # Only a node that still reports 'on' counts as a failed power
            # down; a missing response no longer raises AttributeError here.
            if self._reported_powerstate(power_state) == 'on':
                self.logger.warning(
                    "Failed powering down node %s during power cycle task" %
                    self.node.name)
                self._complete_task(hd_fields.ActionResult.Failure)
                return

            self.logger.debug(
                "Sending set_power = on command to %s" % self.node.name)
            self.exec_ipmi_command(Command.set_power, 'on')

            power_state = None
            for _ in range(self._POWER_POLL_ATTEMPTS):
                power_state = self.exec_ipmi_command(Command.get_power)
                if self._reported_powerstate(power_state) == 'on':
                    self.logger.debug(
                        "%s reports powerstate of on" % self.node.name)
                    break
                elif power_state is None:
                    self.logger.debug("None response on IPMI power query to %s"
                                      % self.node.name)
                time.sleep(self._POWER_POLL_INTERVAL)

            if self._reported_powerstate(power_state) == 'on':
                self._complete_task(hd_fields.ActionResult.Success)
            else:
                self.logger.warning(
                    "Failed powering up node %s during power cycle task" %
                    self.node.name)
                self._complete_task(hd_fields.ActionResult.Failure)
            return
        elif task_action == hd_fields.OrchestratorAction.InterrogateOob:
            mci_id = self.exec_ipmi_command(Command.get_mci)

            self._complete_task(
                hd_fields.ActionResult.Success, result_detail=mci_id)
            return

    def get_ipmi_session(self):
        """
        Initialize a Pyghmi IPMI session to this runner's self.node

        :return: An instance of pyghmi.ipmi.command.Command initialized to nodes' IPMI interface
        :raises errors.DriverError: If the node's OOB type is not 'ipmi' or
            the node has no address on its OOB network
        """
        node = self.node

        if node.oob_type != 'ipmi':
            raise errors.DriverError("Node OOB type is not IPMI")

        ipmi_network = node.oob_parameters['network']
        ipmi_address = node.get_network_address(ipmi_network)

        if ipmi_address is None:
            raise errors.DriverError("Node %s has no IPMI address" %
                                     (node.name))

        ipmi_account = node.oob_parameters['account']
        ipmi_credential = node.oob_parameters['credential']

        # NOTE(review): this writes the first character of the credential to
        # the debug log - confirm that is acceptable for this deployment.
        self.logger.debug("Starting IPMI session to %s with %s/%s" %
                          (ipmi_address, ipmi_account, ipmi_credential[:1]))
        ipmi_session = Command(
            bmc=ipmi_address, userid=ipmi_account, password=ipmi_credential)

        return ipmi_session

    def exec_ipmi_command(self, callable, *args):
        """
        Call an IPMI command after establishing a session with this runner's node

        A new session is opened for every attempt. Up to 5 attempts are
        made, sleeping 15 seconds after each one that raises IpmiException.

        :param callable: The pyghmi Command method to call
        :param args: The args to pass the callable
        :return: The response of the command, or None if every attempt failed
        """
        # NOTE: the parameter name shadows the builtin callable(); it is
        # kept because it is part of this method's signature.
        attempts = 0
        while attempts < 5:
            try:
                self.logger.debug("Initializing IPMI session")
                ipmi_session = self.get_ipmi_session()
            except IpmiException as iex:
                self.logger.error("Error initializing IPMI session for node %s"
                                  % self.node.name)
                self.logger.debug("IPMI Exception: %s" % str(iex))
                self.logger.warning(
                    "IPMI command failed, retrying after 15 seconds...")
                time.sleep(15)
                attempts = attempts + 1
                continue

            try:
                self.logger.debug("Calling IPMI command %s on %s" %
                                  (callable.__name__, self.node.name))
                response = callable(ipmi_session, *args)
                ipmi_session.ipmi_session.logout()
                return response
            except IpmiException as iex:
                self.logger.error("Error sending command: %s" % str(iex))
                self.logger.warning(
                    "IPMI command failed, retrying after 15 seconds...")
                time.sleep(15)
                attempts = attempts + 1

        # Every attempt failed; callers must be prepared for None.
        return None
def list_opts():
    """Return the pyghmi driver options keyed by the driver's key.

    :return: A dict with the single entry ``driver_key`` -> option list
    """
    group_name = PyghmiDriver.driver_key
    driver_opts = PyghmiDriver.pyghmi_driver_options
    return {group_name: driver_opts}