diff --git a/drydock_provisioner/config.py b/drydock_provisioner/config.py index 0466cfbc..f7b979e0 100644 --- a/drydock_provisioner/config.py +++ b/drydock_provisioner/config.py @@ -46,6 +46,7 @@ class DrydockConfig(object): options = [ cfg.IntOpt( 'poll_interval', + min=1, default=10, help= 'Polling interval in seconds for checking subtask or downstream status' @@ -190,6 +191,11 @@ class DrydockConfig(object): help= 'Timeout in minutes between deployment completion and the all boot actions reporting status' ), + cfg.IntOpt( + 'destroy_node', + default=30, + help='Timeout in minutes for releasing a node', + ), ] def __init__(self): diff --git a/drydock_provisioner/drivers/node/maasdriver/actions/node.py b/drydock_provisioner/drivers/node/maasdriver/actions/node.py index de72d8ee..1dcd64c8 100644 --- a/drydock_provisioner/drivers/node/maasdriver/actions/node.py +++ b/drydock_provisioner/drivers/node/maasdriver/actions/node.py @@ -199,9 +199,168 @@ class InterrogateNode(BaseMaasAction): class DestroyNode(BaseMaasAction): """Action to remove node from MaaS in preparation for redeploy.""" + # define the list of node statuses, from which maas server allows releasing a node + + # A machine can be released from following states, based on MaaS API reference. + # The disk of the released machine is erased, and the machine will end up in + # "Ready" state in MaaS after release. actionable_node_statuses = ( "Allocated", "Deployed", "Deploying", "Failed deployment", "Releasing failed", "Failed disk erasing", ) + def start(self): + """ + Destroy Node erases the storage, releases the BM node in MaaS, and + finally deletes the BM node as a resource from the MaaS database. + After successful completion of this action, the destroyed nodes are removed + from MaaS list of resources and will be Unknown to MaaS. These nodes have + to go through the enlistment process and be detected by MaaS as new nodes. + Destroy Node can be performed from any BM node state. 
+ + :return: None + """ + try: + machine_list = maas_machine.Machines(self.maas_client) + machine_list.refresh() + except Exception as ex: + self.logger.warning("Error accessing the MaaS API.", exc_info=ex) + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.failure() + self.task.add_status_msg( + msg='Error accessing MaaS Machines API: {}'.format(str(ex)), + error=True, + ctx='NA', + ctx_type='NA') + self.task.save() + return + + self.task.set_status(hd_fields.TaskStatus.Running) + self.task.save() + + try: + site_design = self._load_site_design() + except errors.OrchestratorError: + self.task.add_status_msg( + msg="Error loading site design.", + error=True, + ctx='NA', + ctx_type='NA') + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.failure() + self.task.save() + return + + nodes = self.orchestrator.process_node_filter(self.task.node_filter, + site_design) + for n in nodes: + try: + machine = machine_list.identify_baremetal_node(n, update_name=False) + + if machine is None: + msg = "Could not locate machine for node {}".format(n.name) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + self.task.success(focus=n.get_id()) + continue + + # First release the node and erase its disks, if MaaS API allows + if machine.status_name in self.actionable_node_statuses: + msg = "Releasing node {}, and erasing storage.".format( + n.name) + self.logger.info(msg) + + try: + machine.release(erase_disk=True, quick_erase=True) + except errors.DriverError: + msg = "Error Releasing node {}, skipping".format(n.name) + self.logger.warning(msg) + self.task.add_status_msg( + msg=msg, error=True, ctx=n.name, ctx_type='node') + self.task.failure(focus=n.get_id()) + continue + + # node release with erase disk will take some time; monitor it + attempts = 0 + max_attempts = (config.config_mgr.conf.timeouts.destroy_node + * 60) // config.config_mgr.conf.maasdriver.poll_interval + + while (attempts < 
max_attempts + and (not machine.status_name.startswith('Ready') + and not machine.status_name.startswith( + 'Failed'))): + attempts = attempts + 1 + time.sleep( + config.config_mgr.conf.maasdriver.poll_interval) + try: + machine.refresh() + self.logger.debug( + "Polling node {} status attempt {:d} of {:d}: {}".format( + n.name, attempts, max_attempts, + machine.status_name)) + except Exception: + self.logger.warning( + "Error updating node {} status during release node, will re-attempt.".format(n.name)) + if machine.status_name.startswith('Ready'): + msg = "Node {} released and disk erased.".format( + n.name) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + self.task.success(focus=n.get_id()) + else: + msg = "Node {} release timed out".format(n.name) + self.logger.warning(msg) + self.task.add_status_msg( + msg=msg, error=True, ctx=n.name, ctx_type='node') + self.task.failure(focus=n.get_id()) + else: + # Node is in a state that cannot be released from MaaS API. + # Reset the storage instead + msg = "Destroy node {} in status: {}, resetting storage.".format( + n.name, machine.status_name) + self.logger.info(msg) + machine.reset_storage_config() + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + + # for both cases above delete the node to force re-commissioning + # But, before deleting the node reset its power type in MaaS if + # the node power type should be virsh. 
+ try: + if n.oob_type == 'libvirt': + self.logger.info( + 'Resetting MaaS virsh power parameters for node {}.'.format( + n.name)) + # setting power type attributes to empty string + # will remove them from maas BMC table + machine.reset_power_parameters() + except AttributeError as attr_er: + pass + + machine.delete() + msg = "Deleted Node: {} in status: {}.".format(n.name, + machine.status_name) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, error=False, ctx=n.name, ctx_type='node') + self.task.success(focus=n.get_id()) + + except errors.DriverError as dex: + msg = "Driver error, while destroying node {}, skipping".format( + n.name) + self.logger.warning(msg, exc_info=dex) + self.task.add_status_msg( + msg=msg, error=True, ctx=n.name, ctx_type='node') + self.task.failure(focus=n.get_id()) + continue + self.task.set_status(hd_fields.TaskStatus.Complete) - self.task.failure() self.task.save() return @@ -949,9 +1108,8 @@ class ConfigureHardware(BaseMaasAction): # Poll machine status attempts = 0 - max_attempts = config.config_mgr.conf.timeouts.configure_hardware * ( - 60 // - config.config_mgr.conf.maasdriver.poll_interval) + max_attempts = (config.config_mgr.conf.timeouts.configure_hardware + * 60) // config.config_mgr.conf.maasdriver.poll_interval while (attempts < max_attempts and (machine.status_name != 'Ready' and @@ -2096,8 +2254,8 @@ class DeployNode(BaseMaasAction): continue attempts = 0 - max_attempts = config.config_mgr.conf.timeouts.deploy_node * ( - 60 // config.config_mgr.conf.maasdriver.poll_interval) + max_attempts = (config.config_mgr.conf.timeouts.deploy_node + * 60) // config.config_mgr.conf.maasdriver.poll_interval while (attempts < max_attempts and (not machine.status_name.startswith('Deployed') diff --git a/drydock_provisioner/drivers/node/maasdriver/models/machine.py b/drydock_provisioner/drivers/node/maasdriver/models/machine.py index 2e0a5eb1..0aa880de 100644 --- a/drydock_provisioner/drivers/node/maasdriver/models/machine.py 
+++ b/drydock_provisioner/drivers/node/maasdriver/models/machine.py @@ -206,14 +206,39 @@ class Machine(model_base.ResourceBase): self.logger.error(msg) raise errors.DriverError(msg) - def release(self, erase_disk=False): + def release(self, erase_disk=False, secure_erase=False, quick_erase=False): """Release a node so it can be redeployed. + Release is opposite of acquire/allocate. After a successful release, the node + will be in Ready state. - :param erase_disk: If true, the local disks on the machine will be quick wiped + :param erase_disk: If true, the local disks on the machine will be erased. + :param secure_erase: If erase_disk and secure_erase are set to True, and + quick_erase is not specified (default to False), MaaS + will try secure_erase first. If the drive does not + support secure erase, MaaS will overwrite the entire + drive with null bytes. + :param quick_erase: If erase_disk and quick_erase are true, 1MB at the + start and at the end of the drive will be erased to make + data recovery inconvenient. + If all three parameters are True and the drive supports + secure erase, secure_erase will have precedence. + If all three parameters are true, but the disk drive + does not support secure erase, MaaS will do quick erase. + But, if the disk drive supports neither secure nor + quick erase, the disk will be re-written with null bytes. + If erase_disk is true, but both secure_erase and quick_erase + are False (default), MAAS will overwrite the whole disk + with null bytes. + If erase_disk is false, MaaS will not erase the drive, before + releasing the node. 
""" url = self.interpolate_url() - options = {'erase': erase_disk} + options = { + 'erase': erase_disk, + 'secure_erase': secure_erase, + 'quick_erase': quick_erase, + } resp = self.api_client.post(url, op='release', files=options) @@ -224,6 +249,26 @@ class Machine(model_base.ResourceBase): self.logger.debug("MaaS response: %s" % resp.text) raise errors.DriverError(brief_msg) + def delete(self): + """ Reset the node storage, and delete it. + After node deletion, the node resource is purged from MaaS resources. + MaaS API machine delete call, only removes the machine from MaaS resource list. + AFter delete, he namchine needs to be manually pwowered on to be re-enlisted + in MaaS as a New node. + + :param erase_disk: If true, the node storage is reset, before node resource + is deleted from maas. + """ + url = self.interpolate_url() + resp = self.api_client.delete(url) + + if not resp.ok: + brief_msg = ("Error deleting node, received HTTP %s from MaaS" % + resp.status_code) + self.logger.error(brief_msg) + self.logger.debug("MaaS response: %s" % resp.text) + raise errors.DriverError(brief_msg) + def commission(self, debug=False): """Start the MaaS commissioning process. @@ -355,6 +400,29 @@ class Machine(model_base.ResourceBase): "Failed updating power parameters MAAS url %s - return code %s\n%s" % (url, resp.status_code.resp.text)) + def reset_power_parameters(self): + """Reset power type and parameters for this node to manual. + This is done to address the MaaS api issue detecting multiple BMC NIC + after a node delete. + + Only available after the node has been added to MAAS. 
+ """ + + url = self.interpolate_url() + + self.logger.debug("Resetting node power type for machine {}".format( + self.resource_id)) + self.power_type = 'manual' + power_params = {'power_type': 'manual'} + resp = self.api_client.put(url, files=power_params) + + if resp.status_code == 200: + return True + + raise errors.DriverError( + "Failed updating power parameters MAAS url {} - return code {}\n{}".format( + url, resp.status_code.resp.text)) + def to_dict(self): """Serialize this resource instance into a dict. diff --git a/drydock_provisioner/orchestrator/actions/orchestrator.py b/drydock_provisioner/orchestrator/actions/orchestrator.py index 3dddf2a9..44626967 100644 --- a/drydock_provisioner/orchestrator/actions/orchestrator.py +++ b/drydock_provisioner/orchestrator/actions/orchestrator.py @@ -202,10 +202,66 @@ class DestroyNodes(BaseAction): def start(self): """Start executing this action.""" - self.task.set_status(hd_fields.TaskStatus.Complete) - self.task.failure() + self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() + node_driver = self._get_driver('node') + + if node_driver is None: + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.add_status_msg( + msg="No node driver enabled, ending task.", + error=True, + ctx=str(self.task.get_id()), + ctx_type='task') + self.task.result.set_message("No NodeDriver enabled.") + self.task.result.set_reason("Bad Configuration.") + self.task.failure() + self.task.save() + return + + target_nodes = self.orchestrator.get_target_nodes(self.task) + + if not target_nodes: + self.task.add_status_msg( + msg="No nodes in scope, no work to to do.", + error=False, + ctx='NA', + ctx_type='NA') + self.task.success() + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.save() + return + + node_release_task = None + while True: + if node_release_task is None: + node_release_task = self.orchestrator.create_task( + design_ref=self.task.design_ref, + 
action=hd_fields.OrchestratorAction.DestroyNode, + node_filter=self.task.node_filter) + self.task.register_subtask(node_release_task) + + self.logger.info( + "Starting node driver task %s to Release nodes." % + (node_release_task.get_id())) + node_driver.execute_task(node_release_task.get_id()) + + node_release_task = self.state_manager.get_task( + node_release_task.get_id()) + + try: + if not node_release_task.retry_task(max_attempts=3): + break + except errors.MaxRetriesReached: + self.task.failure() + break + + self.task.set_status(hd_fields.TaskStatus.Complete) + self.task.bubble_results( + action_filter=hd_fields.OrchestratorAction.DestroyNode) + self.task.align_result() + self.task.save() return