From 471bc6147e43911205d5029447e91c58a8e5d8c2 Mon Sep 17 00:00:00 2001 From: Ahmad Mahmoudi Date: Thu, 26 Jul 2018 19:07:23 -0500 Subject: [PATCH] Added code to support Destroy Node Added API capability to erase the storage and destroy one or more nodes from any node state. After this operation, the destroyed nodes are deleted from maas database, and will need to be re-enlisted and be added to maas in New state, if they need to be re-deployed. Change-Id: I986278902965d93514f8198602f971669e1e2db3 --- drydock_provisioner/config.py | 6 + .../drivers/node/maasdriver/actions/node.py | 170 +++++++++++++++++- .../drivers/node/maasdriver/models/machine.py | 74 +++++++- .../orchestrator/actions/orchestrator.py | 60 ++++++- 4 files changed, 299 insertions(+), 11 deletions(-) diff --git a/drydock_provisioner/config.py b/drydock_provisioner/config.py index 0466cfbc..f7b979e0 100644 --- a/drydock_provisioner/config.py +++ b/drydock_provisioner/config.py @@ -46,6 +46,7 @@ class DrydockConfig(object): options = [ cfg.IntOpt( 'poll_interval', + min=1, default=10, help= 'Polling interval in seconds for checking subtask or downstream status' @@ -190,6 +191,11 @@ class DrydockConfig(object): help= 'Timeout in minutes between deployment completion and the all boot actions reporting status' ), + cfg.IntOpt( + 'destroy_node', + default=30, + help='Timeout in minutes for releasing a node', + ), ] def __init__(self): diff --git a/drydock_provisioner/drivers/node/maasdriver/actions/node.py b/drydock_provisioner/drivers/node/maasdriver/actions/node.py index de72d8ee..1dcd64c8 100644 --- a/drydock_provisioner/drivers/node/maasdriver/actions/node.py +++ b/drydock_provisioner/drivers/node/maasdriver/actions/node.py @@ -199,9 +199,168 @@ class InterrogateNode(BaseMaasAction): class DestroyNode(BaseMaasAction): """Action to remove node from MaaS in preparation for redeploy.""" + # define the list of node statuses, from which maas server allows releasing a node + + # A machine can be released from 
following states, based on MaaS API reference. + # The disk of the released machine is erased, and the machine will end up in + # "Ready" state in MaaS after release. + actionable_node_statuses = ( + "Allocated", + "Deployed", + "Deploying", + "Failed deployment", + "Releasing failed", + "Failed disk erasing", + ) + def start(self): + """ + Destroy Node erases the storage, releases the BM node in MaaS, and + finally deletes the BM node as a resource from the MaaS database. + After successful completion of this action, the destroyed nodes are removed + from MaaS list of resources and will be unknown to MaaS. These nodes have + to go through the enlistment process and be detected by MaaS as new nodes. + Destroy Node can be performed from any BM node state. + + :return: None + """ + try: + machine_list = maas_machine.Machines(self.maas_client) + machine_list.refresh() + except Exception as ex: + self.logger.warning("Error accessing the MaaS API.", exc_info=ex) + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.failure() + self.task.add_status_msg( + msg='Error accessing MaaS Machines API: {}'.format(str(ex)), + error=True, + ctx='NA', + ctx_type='NA') + self.task.save() + return + + self.task.set_status(hd_fields.TaskStatus.Running) + self.task.save() + + try: + site_design = self._load_site_design() + except errors.OrchestratorError: + self.task.add_status_msg( + msg="Error loading site design.", + error=True, + ctx='NA', + ctx_type='NA') + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.failure() + self.task.save() + return + + nodes = self.orchestrator.process_node_filter(self.task.node_filter, + site_design) + for n in nodes: + try: + machine = machine_list.identify_baremetal_node(n, update_name=False) + + if machine is None: + msg = "Could not locate machine for node {}".format(n.name) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + self.task.success(focus=n.get_id()) + 
continue + + # First release the node and erase its disks, if MaaS API allows + if machine.status_name in self.actionable_node_statuses: + msg = "Releasing node {}, and erasing storage.".format( + n.name) + self.logger.info(msg) + + try: + machine.release(erase_disk=True, quick_erase=True) + except errors.DriverError: + msg = "Error Releasing node {}, skipping".format(n.name) + self.logger.warning(msg) + self.task.add_status_msg( + msg=msg, error=True, ctx=n.name, ctx_type='node') + self.task.failure(focus=n.get_id()) + continue + + # node release with erase disk will take some time; monitor it + attempts = 0 + max_attempts = (config.config_mgr.conf.timeouts.destroy_node + * 60) // config.config_mgr.conf.maasdriver.poll_interval + + while (attempts < max_attempts + and (not machine.status_name.startswith('Ready') + and not machine.status_name.startswith( + 'Failed'))): + attempts = attempts + 1 + time.sleep( + config.config_mgr.conf.maasdriver.poll_interval) + try: + machine.refresh() + self.logger.debug( + "Polling node {} status attempt {:d} of {:d}: {}".format( + n.name, attempts, max_attempts, + machine.status_name)) + except Exception: + self.logger.warning( + "Error updating node {} status during release node, will re-attempt.".format(n.name)) + if machine.status_name.startswith('Ready'): + msg = "Node {} released and disk erased.".format( + n.name) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + self.task.success(focus=n.get_id()) + else: + msg = "Node {} release timed out".format(n.name) + self.logger.warning(msg) + self.task.add_status_msg( + msg=msg, error=True, ctx=n.name, ctx_type='node') + self.task.failure(focus=n.get_id()) + else: + # Node is in a state that cannot be released from MaaS API. 
+ # Reset the storage instead + msg = "Destroy node {} in status: {}, resetting storage.".format( + n.name, machine.status_name) + self.logger.info(msg) + machine.reset_storage_config() + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + + # for both cases above delete the node to force re-commissioning + # But, before deleting the node reset its power type in maas if + # the node power type should be virsh. + try: + if n.oob_type == 'libvirt': + self.logger.info( + 'Resetting MaaS virsh power parameters for node {}.'.format( + n.name)) + # setting power type attributes to empty string + # will remove them from maas BMC table + machine.reset_power_parameters() + except AttributeError as attr_er: + pass + + machine.delete() + msg = "Deleted Node: {} in status: {}.".format(n.name, + machine.status_name) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + self.task.success(focus=n.get_id()) + + except errors.DriverError as dex: + msg = "Driver error, while destroying node {}, skipping".format( + n.name) + self.logger.warning(msg, exc_info=dex) + self.task.add_status_msg( + msg=msg, error=True, ctx=n.name, ctx_type='node') + self.task.failure(focus=n.get_id()) + continue + self.task.set_status(hd_fields.TaskStatus.Complete) - self.task.failure() self.task.save() return @@ -949,9 +1108,8 @@ class ConfigureHardware(BaseMaasAction): # Poll machine status attempts = 0 - max_attempts = config.config_mgr.conf.timeouts.configure_hardware * ( - 60 // - config.config_mgr.conf.maasdriver.poll_interval) + max_attempts = (config.config_mgr.conf.timeouts.configure_hardware + * 60) // config.config_mgr.conf.maasdriver.poll_interval while (attempts < max_attempts and (machine.status_name != 'Ready' and @@ -2096,8 +2254,8 @@ class DeployNode(BaseMaasAction): continue attempts = 0 - max_attempts = config.config_mgr.conf.timeouts.deploy_node * ( - 60 // config.config_mgr.conf.maasdriver.poll_interval) + 
max_attempts = (config.config_mgr.conf.timeouts.deploy_node + * 60) // config.config_mgr.conf.maasdriver.poll_interval while (attempts < max_attempts and (not machine.status_name.startswith('Deployed') diff --git a/drydock_provisioner/drivers/node/maasdriver/models/machine.py b/drydock_provisioner/drivers/node/maasdriver/models/machine.py index 2e0a5eb1..0aa880de 100644 --- a/drydock_provisioner/drivers/node/maasdriver/models/machine.py +++ b/drydock_provisioner/drivers/node/maasdriver/models/machine.py @@ -206,14 +206,39 @@ class Machine(model_base.ResourceBase): self.logger.error(msg) raise errors.DriverError(msg) - def release(self, erase_disk=False): + def release(self, erase_disk=False, secure_erase=False, quick_erase=False): """Release a node so it can be redeployed. + Release is opposite of acquire/allocate. After a successful release, the node + will be in Ready state. - :param erase_disk: If true, the local disks on the machine will be quick wiped + :param erase_disk: If true, the local disks on the machine will be erased. + :param secure_erase: If erase_disk and secure_erase are set to True, and + quick_erase is not specified (default to False), MaaS + will try secure_erase first. If the drive does not + support secure erase, MaaS will overwrite the entire + drive with null bytes. + :param quick_erase: If erase_disk and quick_erase are true, 1MB at the + start and at the end of the drive will be erased to make + data recovery inconvenient. + If all three parameters are True and the drive supports + secure erase, secure_erase will have precedence. + If all three parameters are true, but the disk drive + does not support secure erase, MaaS will do quick erase. + But, if the disk drive supports neither secure nor + quick erase, the disk will be re-written with null bytes. + If erase_disk is true, but both secure_erase and quick_erase + are False (default), MAAS will overwrite the whole disk + with null bytes. 
+ If erase_disk is false, MaaS will not erase the drive, before + releasing the node. """ url = self.interpolate_url() - options = {'erase': erase_disk} + options = { + 'erase': erase_disk, + 'secure_erase': secure_erase, + 'quick_erase': quick_erase, + } resp = self.api_client.post(url, op='release', files=options) @@ -224,6 +249,26 @@ class Machine(model_base.ResourceBase): self.logger.debug("MaaS response: %s" % resp.text) raise errors.DriverError(brief_msg) + def delete(self): + """ Delete this node from MaaS. + After node deletion, the node resource is purged from MaaS resources. + MaaS API machine delete call only removes the machine from MaaS resource list. + After delete, the machine needs to be manually powered on to be re-enlisted + in MaaS as a New node. + + Note: this call only removes the machine from the MaaS resource + list; the node's storage is not touched. + """ + url = self.interpolate_url() + resp = self.api_client.delete(url) + + if not resp.ok: + brief_msg = ("Error deleting node, received HTTP %s from MaaS" % + resp.status_code) + self.logger.error(brief_msg) + self.logger.debug("MaaS response: %s" % resp.text) + raise errors.DriverError(brief_msg) + def commission(self, debug=False): """Start the MaaS commissioning process. @@ -355,6 +400,29 @@ class Machine(model_base.ResourceBase): "Failed updating power parameters MAAS url %s - return code %s\n%s" % (url, resp.status_code.resp.text)) + def reset_power_parameters(self): + """Reset power type and parameters for this node to manual. + This is done to address the MaaS api issue detecting multiple BMC NIC + after a node delete. + + Only available after the node has been added to MAAS. 
+ """ + + url = self.interpolate_url() + + self.logger.debug("Resetting node power type for machine {}".format( + self.resource_id)) + self.power_type = 'manual' + power_params = {'power_type': 'manual'} + resp = self.api_client.put(url, files=power_params) + + if resp.status_code == 200: + return True + + raise errors.DriverError( + "Failed updating power parameters MAAS url {} - return code {}\n{}".format( + url, resp.status_code.resp.text)) + def to_dict(self): """Serialize this resource instance into a dict. diff --git a/drydock_provisioner/orchestrator/actions/orchestrator.py b/drydock_provisioner/orchestrator/actions/orchestrator.py index 3dddf2a9..44626967 100644 --- a/drydock_provisioner/orchestrator/actions/orchestrator.py +++ b/drydock_provisioner/orchestrator/actions/orchestrator.py @@ -202,10 +202,66 @@ class DestroyNodes(BaseAction): def start(self): """Start executing this action.""" - self.task.set_status(hd_fields.TaskStatus.Complete) - self.task.failure() + self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() + node_driver = self._get_driver('node') + + if node_driver is None: + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.add_status_msg( + msg="No node driver enabled, ending task.", + error=True, + ctx=str(self.task.get_id()), + ctx_type='task') + self.task.result.set_message("No NodeDriver enabled.") + self.task.result.set_reason("Bad Configuration.") + self.task.failure() + self.task.save() + return + + target_nodes = self.orchestrator.get_target_nodes(self.task) + + if not target_nodes: + self.task.add_status_msg( + msg="No nodes in scope, no work to to do.", + error=False, + ctx='NA', + ctx_type='NA') + self.task.success() + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.save() + return + + node_release_task = None + while True: + if node_release_task is None: + node_release_task = self.orchestrator.create_task( + design_ref=self.task.design_ref, + 
action=hd_fields.OrchestratorAction.DestroyNode, + node_filter=self.task.node_filter) + self.task.register_subtask(node_release_task) + + self.logger.info( + "Starting node driver task %s to Release nodes." % + (node_release_task.get_id())) + node_driver.execute_task(node_release_task.get_id()) + + node_release_task = self.state_manager.get_task( + node_release_task.get_id()) + + try: + if not node_release_task.retry_task(max_attempts=3): + break + except errors.MaxRetriesReached: + self.task.failure() + break + + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.bubble_results( + action_filter=hd_fields.OrchestratorAction.DestroyNode) + self.task.align_result() + self.task.save() return