Added code to support Destroy Node

Added API capability to erase the storage and destroy one
or more nodes from any node state.
After this operation, the destroyed nodes are deleted from the MaaS
database and will need to be re-enlisted and added back to MaaS
in the New state if they need to be re-deployed.

Change-Id: I986278902965d93514f8198602f971669e1e2db3
Ahmad Mahmoudi 2018-07-26 19:07:23 -05:00
parent 4423ee985e
commit 471bc6147e
4 changed files with 299 additions and 11 deletions
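
For context, a minimal sketch of how an operator might exercise this new capability end to end through Drydock's task API. The host, token, endpoint path, action name, and design reference below are illustrative assumptions, not part of this change; verify them against the deployed API:

import requests  # hypothetical client-side sketch, not part of this commit

DRYDOCK_URL = 'http://drydock.example.com:9000'  # placeholder host
TOKEN = 'replace-with-keystone-token'            # placeholder auth token

# Submit a destroy_nodes task scoped to a single node via node_filter.
task = {
    'action': 'destroy_nodes',
    'design_ref': 'deckhand+http://deckhand-api/revisions/1/rendered-documents',
    'node_filter': {
        'filter_set_type': 'union',
        'filter_set': [{'filter_type': 'union', 'node_names': ['node01']}],
    },
}

resp = requests.post(
    '{}/api/v1.0/tasks'.format(DRYDOCK_URL),
    headers={'X-Auth-Token': TOKEN},
    json=task)
resp.raise_for_status()
print(resp.json())  # the returned task document carries the task ID and status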


@@ -46,6 +46,7 @@ class DrydockConfig(object):
options = [
cfg.IntOpt(
'poll_interval',
min=1,
default=10,
help=
'Polling interval in seconds for checking subtask or downstream status'
@@ -190,6 +191,11 @@ class DrydockConfig(object):
help=
'Timeout in minutes between deployment completion and the all boot actions reporting status'
),
cfg.IntOpt(
'destroy_node',
default=30,
help='Timeout in minutes for releasing a node',
),
]
def __init__(self):

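For reference, a minimal drydock.conf sketch showing where these knobs live, assuming destroy_node sits in the same [timeouts] group the driver reads via config_mgr.conf.timeouts, and poll_interval in [maasdriver]; the values shown are the defaults from this change:

[timeouts]
# Minutes to wait for a node release with disk erase to finish.
destroy_node = 30

[maasdriver]
# Seconds between subtask/status polls.
poll_interval = 10
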

@@ -199,9 +199,168 @@ class InterrogateNode(BaseMaasAction):
class DestroyNode(BaseMaasAction):
"""Action to remove node from MaaS in preparation for redeploy."""
# Define the list of node statuses from which the MaaS server allows a node
# to be released, based on the MaaS API reference.
# The disk of the released machine is erased, and the machine will end up in
# the "Ready" state in MaaS after the release.
actionable_node_statuses = (
"Allocated",
"Deployed",
"Deploying",
"Failed deployment",
"Releasing failed",
"Failed disk erasing",
)
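# Nodes found in any other state skip the release path below; their
# storage configuration is reset via reset_storage_config() and the
# machine is deleted directly to force re-commissioning.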
def start(self):
"""
Destroy Node erases the storage, releases the BM node in MaaS, and
finally deletes the BM node as a resource from the MaaS database.
After successful completion of this action, the destroyed nodes are removed
from the MaaS list of resources and will be unknown to MaaS. These nodes have
to go through the enlistment process and be detected by MaaS as new nodes.
Destroy Node can be performed from any BM node state.
:return: None
"""
try:
machine_list = maas_machine.Machines(self.maas_client)
machine_list.refresh()
except Exception as ex:
self.logger.warning("Error accessing the MaaS API.", exc_info=ex)
self.task.set_status(hd_fields.TaskStatus.Complete)
self.task.failure()
self.task.add_status_msg(
msg='Error accessing MaaS Machines API: {}'.format(str(ex)),
error=True,
ctx='NA',
ctx_type='NA')
self.task.save()
return
self.task.set_status(hd_fields.TaskStatus.Running)
self.task.save()
try:
site_design = self._load_site_design()
except errors.OrchestratorError:
self.task.add_status_msg(
msg="Error loading site design.",
error=True,
ctx='NA',
ctx_type='NA')
self.task.set_status(hd_fields.TaskStatus.Complete)
self.task.failure()
self.task.save()
return
nodes = self.orchestrator.process_node_filter(self.task.node_filter,
site_design)
for n in nodes:
try:
machine = machine_list.identify_baremetal_node(n, update_name=False)
if machine is None:
msg = "Could not locate machine for node {}".format(n.name)
self.logger.info(msg)
self.task.add_status_msg(
msg=msg, error=False, ctx=n.name, ctx_type='node')
self.task.success(focus=n.get_id())
continue
# First release the node and erase its disks, if the MaaS API allows it.
if machine.status_name in self.actionable_node_statuses:
msg = "Releasing node {}, and erasing storage.".format(
n.name)
self.logger.info(msg)
try:
machine.release(erase_disk=True, quick_erase=True)
except errors.DriverError:
msg = "Error Releasing node {}, skipping".format(n.name)
self.logger.warning(msg)
self.task.add_status_msg(
msg=msg, error=True, ctx=n.name, ctx_type='node')
self.task.failure(focus=n.get_id())
continue
# Node release with disk erase takes some time; poll the status until it completes or times out.
attempts = 0
max_attempts = (config.config_mgr.conf.timeouts.destroy_node
* 60) // config.config_mgr.conf.maasdriver.poll_interval
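# For illustration: with the defaults introduced in this change
# (timeouts.destroy_node = 30 minutes, maasdriver.poll_interval = 10
# seconds), this yields (30 * 60) // 10 = 180 polling attempts.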
while (attempts < max_attempts
and (not machine.status_name.startswith('Ready')
and not machine.status_name.startswith(
'Failed'))):
attempts = attempts + 1
time.sleep(
config.config_mgr.conf.maasdriver.poll_interval)
try:
machine.refresh()
self.logger.debug(
"Polling node {} status attempt {:d} of {:d}: {}".format(
n.name, attempts, max_attempts,
machine.status_name))
except Exception:
self.logger.warning(
"Error updating node {} status during release node, will re-attempt.".format(n.name))
if machine.status_name.startswith('Ready'):
msg = "Node {} released and disk erased.".format(
n.name)
self.logger.info(msg)
self.task.add_status_msg(
msg=msg, error=False, ctx=n.name, ctx_type='node')
self.task.success(focus=n.get_id())
else:
msg = "Node {} release timed out".format(n.name)
self.logger.warning(msg)
self.task.add_status_msg(
msg=msg, error=True, ctx=n.name, ctx_type='node')
self.task.failure(focus=n.get_id())
else:
# Node is in a state from which the MaaS API cannot release it.
# Reset the storage instead
msg = "Destroy node {} in status: {}, resetting storage.".format(
n.name, machine.status_name)
self.logger.info(msg)
machine.reset_storage_config()
self.task.add_status_msg(
msg=msg, error=False, ctx=n.name, ctx_type='node')
# For both cases above, delete the node to force re-commissioning.
# But before deleting the node, reset its power type in MaaS if
# the node's power type should be virsh.
try:
if n.oob_type == 'libvirt':
self.logger.info(
'Resetting MaaS virsh power parameters for node {}.'.format(
n.name))
# Setting the power type attributes to an empty string
# will remove them from the MaaS BMC table.
machine.reset_power_parameters()
except AttributeError:
# Node design has no oob_type attribute; nothing to reset.
pass
machine.delete()
msg = "Deleted Node: {} in status: {}.".format(n.name,
machine.status_name)
self.logger.info(msg)
self.task.add_status_msg(
msg=msg, error=False, ctx=n.name, ctx_type='node')
self.task.success(focus=n.get_id())
except errors.DriverError as dex:
msg = "Driver error, while destroying node {}, skipping".format(
n.name)
self.logger.warning(msg, exc_info=dex)
self.task.add_status_msg(
msg=msg, error=True, ctx=n.name, ctx_type='node')
self.task.failure(focus=n.get_id())
continue
self.task.set_status(hd_fields.TaskStatus.Complete)
self.task.failure()
self.task.save()
return
@@ -949,9 +1108,8 @@ class ConfigureHardware(BaseMaasAction):
# Poll machine status
attempts = 0
max_attempts = (config.config_mgr.conf.timeouts.configure_hardware
* 60) // config.config_mgr.conf.maasdriver.poll_interval
while (attempts < max_attempts and
(machine.status_name != 'Ready' and
@@ -2096,8 +2254,8 @@ class DeployNode(BaseMaasAction):
continue
attempts = 0
max_attempts = (config.config_mgr.conf.timeouts.deploy_node
* 60) // config.config_mgr.conf.maasdriver.poll_interval
while (attempts < max_attempts
and (not machine.status_name.startswith('Deployed')


@@ -206,14 +206,39 @@ class Machine(model_base.ResourceBase):
self.logger.error(msg)
raise errors.DriverError(msg)
def release(self, erase_disk=False, secure_erase=False, quick_erase=False):
"""Release a node so it can be redeployed.
Release is the opposite of acquire/allocate. After a successful release, the node
will be in the Ready state.
:param erase_disk: If true, the local disks on the machine will be erased.
:param secure_erase: If erase_disk and secure_erase are set to True, and
quick_erase is not specified (defaults to False), MaaS
will try secure erase first. If the drive does not
support secure erase, MaaS will overwrite the entire
drive with null bytes.
:param quick_erase: If erase_disk and quick_erase are true, 1MB at the
start and at the end of the drive will be erased to make
data recovery inconvenient.
If all three parameters are True and the drive supports
secure erase, secure_erase will have precedence.
If all three parameters are True, but the disk drive
does not support secure erase, MaaS will do a quick erase.
But if the disk drive supports neither secure nor
quick erase, the disk will be overwritten with null bytes.
If erase_disk is True, but both secure_erase and quick_erase
are False (the default), MaaS will overwrite the whole disk
with null bytes.
If erase_disk is False, MaaS will not erase the drive before
releasing the node.
"""
url = self.interpolate_url()
options = {
'erase': erase_disk,
'secure_erase': secure_erase,
'quick_erase': quick_erase,
}
resp = self.api_client.post(url, op='release', files=options)
@@ -224,6 +249,26 @@
self.logger.debug("MaaS response: %s" % resp.text)
raise errors.DriverError(brief_msg)
def delete(self):
""" Reset the node storage, and delete it.
After node deletion, the node resource is purged from MaaS resources.
MaaS API machine delete call, only removes the machine from MaaS resource list.
AFter delete, he namchine needs to be manually pwowered on to be re-enlisted
in MaaS as a New node.
:param erase_disk: If true, the node storage is reset, before node resource
is deleted from maas.
"""
url = self.interpolate_url()
resp = self.api_client.delete(url)
if not resp.ok:
brief_msg = ("Error deleting node, received HTTP %s from MaaS" %
resp.status_code)
self.logger.error(brief_msg)
self.logger.debug("MaaS response: %s" % resp.text)
raise errors.DriverError(brief_msg)
def commission(self, debug=False):
"""Start the MaaS commissioning process.
@@ -355,6 +400,29 @@
"Failed updating power parameters MAAS url %s - return code %s\n%s"
% (url, resp.status_code, resp.text))
def reset_power_parameters(self):
"""Reset power type and parameters for this node to manual.
This is done to address a MaaS API issue where multiple BMC NICs are
detected after a node delete.
Only available after the node has been added to MAAS.
"""
url = self.interpolate_url()
self.logger.debug("Resetting node power type for machine {}".format(
self.resource_id))
self.power_type = 'manual'
power_params = {'power_type': 'manual'}
resp = self.api_client.put(url, files=power_params)
if resp.status_code == 200:
return True
raise errors.DriverError(
"Failed updating power parameters MAAS url {} - return code {}\n{}".format(
url, resp.status_code.resp.text))
def to_dict(self):
"""Serialize this resource instance into a dict.


@@ -202,10 +202,66 @@ class DestroyNodes(BaseAction):
def start(self):
"""Start executing this action."""
self.task.set_status(hd_fields.TaskStatus.Running)
self.task.save()
node_driver = self._get_driver('node')
if node_driver is None:
self.task.set_status(hd_fields.TaskStatus.Complete)
self.task.add_status_msg(
msg="No node driver enabled, ending task.",
error=True,
ctx=str(self.task.get_id()),
ctx_type='task')
self.task.result.set_message("No NodeDriver enabled.")
self.task.result.set_reason("Bad Configuration.")
self.task.failure()
self.task.save()
return
target_nodes = self.orchestrator.get_target_nodes(self.task)
if not target_nodes:
self.task.add_status_msg(
msg="No nodes in scope, no work to to do.",
error=False,
ctx='NA',
ctx_type='NA')
self.task.success()
self.task.set_status(hd_fields.TaskStatus.Complete)
self.task.save()
return
node_release_task = None
while True:
if node_release_task is None:
node_release_task = self.orchestrator.create_task(
design_ref=self.task.design_ref,
action=hd_fields.OrchestratorAction.DestroyNode,
node_filter=self.task.node_filter)
self.task.register_subtask(node_release_task)
self.logger.info(
"Starting node driver task %s to Release nodes." %
(node_release_task.get_id()))
node_driver.execute_task(node_release_task.get_id())
node_release_task = self.state_manager.get_task(
node_release_task.get_id())
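# retry_task() reports whether another pass is needed for any failed
# nodes; the loop exits once no retry is required, and MaxRetriesReached
# marks the parent task failed after three attempts.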
try:
if not node_release_task.retry_task(max_attempts=3):
break
except errors.MaxRetriesReached:
self.task.failure()
break
self.task.set_status(hd_fields.TaskStatus.Complete)
self.task.bubble_results(
action_filter=hd_fields.OrchestratorAction.DestroyNode)
self.task.align_result()
self.task.save()
return