From 6e342060df52ff7cacf1e55e89cdfd6d460677c8 Mon Sep 17 00:00:00 2001 From: Scott Hussey Date: Wed, 14 Jun 2017 11:06:19 -0500 Subject: [PATCH] DRYD2 - MVP - Phase 2 Add Driver tasks for ConfigureHardware (equal to commission stage in MaaS) --- .../drivers/node/maasdriver/driver.py | 167 ++++++++++++++++-- .../drivers/node/maasdriver/models/base.py | 2 +- .../drivers/node/maasdriver/models/machine.py | 12 +- drydock_provisioner/orchestrator/__init__.py | 31 +++- 4 files changed, 195 insertions(+), 17 deletions(-) diff --git a/drydock_provisioner/drivers/node/maasdriver/driver.py b/drydock_provisioner/drivers/node/maasdriver/driver.py index f7079dcc..559316b7 100644 --- a/drydock_provisioner/drivers/node/maasdriver/driver.py +++ b/drydock_provisioner/drivers/node/maasdriver/driver.py @@ -22,6 +22,7 @@ import drydock_provisioner.objects.task as task_model from drydock_provisioner.drivers.node import NodeDriver from .api_client import MaasRequestFactory + import drydock_provisioner.drivers.node.maasdriver.models.fabric as maas_fabric import drydock_provisioner.drivers.node.maasdriver.models.vlan as maas_vlan import drydock_provisioner.drivers.node.maasdriver.models.subnet as maas_subnet @@ -133,7 +134,7 @@ class MaasNodeDriver(NodeDriver): 'retry': False, 'detail': 'MaaS Network creation timed-out' } - self.logger.warn("Thread for task %s timed out after 120s" % (subtask.get_id())) + self.logger.warning("Thread for task %s timed out after 120s" % (subtask.get_id())) self.orchestrator.task_field_update(task.get_id(), status=hd_fields.TaskStatus.Complete, result=hd_fields.ActionResult.Failure, @@ -153,7 +154,9 @@ class MaasNodeDriver(NodeDriver): subtasks = [] result_detail = { - 'detail': [] + 'detail': [], + 'failed_nodes': [], + 'successful_nodes': [], } for n in task.node_list: @@ -183,25 +186,95 @@ class MaasNodeDriver(NodeDriver): if subtask.status == hd_fields.TaskStatus.Complete: self.logger.info("Task %s to identify node %s complete - status %s" % (subtask.get_id(), n, subtask.get_result())) - - result_detail['detail'].extend(subtask.result_detail['detail']) running_subtasks = running_subtasks - 1 - if subtask.result in [hd_fields.ActionResult.Success, - hd_fields.ActionResult.PartialSuccess]: + if subtask.result == hd_fields.ActionResult.Success: + result_detail['successful_nodes'].extend(subtask.node_list) worked = True - elif subtask.result in [hd_fields.ActionResult.Failure, - hd_fields.ActionResult.PartialSuccess]: + elif subtask.result == hd_fields.ActionResult.Failure: + result_detail['failed_nodes'].extend(subtask.node_list) failed = True + elif subtask.result == hd_fields.ActionResult.PartialSuccess: + worked = failed = True time.sleep(1 * 60) attempts = attempts + 1 if running_subtasks > 0: - self.logger.warn("Time out for task %s before all subtask threads complete" % (task.get_id())) + self.logger.warning("Time out for task %s before all subtask threads complete" % (task.get_id())) result = hd_fields.ActionResult.DependentFailure result_detail['detail'].append('Some subtasks did not complete before the timeout threshold') - if worked and failed: + elif worked and failed: + result = hd_fields.ActionResult.PartialSuccess + elif worked: + result = hd_fields.ActionResult.Success + else: + result = hd_fields.ActionResult.Failure + + self.orchestrator.task_field_update(task.get_id(), + status=hd_fields.TaskStatus.Complete, + result=result, + result_detail=result_detail) + elif task.action == hd_fields.OrchestratorAction.ConfigureHardware: + 
self.orchestrator.task_field_update(task.get_id(),
+ status=hd_fields.TaskStatus.Running)
+
+ self.logger.debug("Starting subtasks to commission %s nodes." % (len(task.node_list)))
+
+ subtasks = []
+
+ result_detail = {
+ 'detail': [],
+ 'failed_nodes': [],
+ 'successful_nodes': [],
+ }
+
+ for n in task.node_list:
+ subtask = self.orchestrator.create_task(task_model.DriverTask,
+ parent_task_id=task.get_id(), design_id=design_id,
+ action=hd_fields.OrchestratorAction.ConfigureHardware,
+ site_name=task.site_name,
+ task_scope={'site': task.site_name, 'node_names': [n]})
+ runner = MaasTaskRunner(state_manager=self.state_manager,
+ orchestrator=self.orchestrator,
+ task_id=subtask.get_id(), config=self.config)
+
+ self.logger.info("Starting thread for task %s to commission node %s" % (subtask.get_id(), n))
+
+ runner.start()
+ subtasks.append(subtask.get_id())
+
+ running_subtasks = len(subtasks)
+ attempts = 0
+ worked = failed = False
+
+ # TODO Add timeout to config
+ while running_subtasks > 0 and attempts < 20:
+ for t in subtasks:
+ subtask = self.state_manager.get_task(t)
+
+ if subtask.status == hd_fields.TaskStatus.Complete:
+ self.logger.info("Task %s to commission node %s complete - status %s" %
+ (subtask.get_id(), n, subtask.get_result()))
+ running_subtasks = running_subtasks - 1
+
+ if subtask.result == hd_fields.ActionResult.Success:
+ result_detail['successful_nodes'].extend(subtask.node_list)
+ worked = True
+ elif subtask.result == hd_fields.ActionResult.Failure:
+ result_detail['failed_nodes'].extend(subtask.node_list)
+ failed = True
+ elif subtask.result == hd_fields.ActionResult.PartialSuccess:
+ worked = failed = True
+
+ time.sleep(1 * 60)
+ attempts = attempts + 1
+
+ if running_subtasks > 0:
+ self.logger.warning("Time out for task %s before all subtask threads complete" % (task.get_id()))
+ result = hd_fields.ActionResult.DependentFailure
+ result_detail['detail'].append('Some subtasks did not complete before the timeout threshold')
+ elif worked and failed:
 result = hd_fields.ActionResult.PartialSuccess
 elif worked:
 result = hd_fields.ActionResult.Success
 else:
 result = hd_fields.ActionResult.Failure
@@ -440,8 +513,80 @@ class MaasTaskRunner(drivers.DriverTaskRunner):
 status=hd_fields.TaskStatus.Complete,
 result=result,
 result_detail=result_detail)
+ elif task_action == hd_fields.OrchestratorAction.ConfigureHardware:
+ try:
+ machine_list = maas_machine.Machines(self.maas_client)
+ machine_list.refresh()
+ except:
+ self.orchestrator.task_field_update(self.task.get_id(),
+ status=hd_fields.TaskStatus.Complete,
+ result=hd_fields.ActionResult.Failure,
+ result_detail={'detail': 'Error accessing MaaS Machines API', 'retry': True})
+ return
+ nodes = self.task.node_list
+ result_detail = {'detail': []}
+ worked = failed = False
-
+ # TODO Better way of representing the node statuses than static strings
+ for n in nodes:
+ try:
+ self.logger.debug("Locating node %s for commissioning" % (n))
+ node = site_design.get_baremetal_node(n)
+ machine = machine_list.identify_baremetal_node(node, update_name=False)
+ if machine is not None:
+ if machine.status_name == 'New':
+ self.logger.debug("Located node %s in MaaS, starting commissioning" % (n))
+ machine.commission()
+
+ # Poll machine status
+ attempts = 0
+
+ while attempts < 20 and machine.status_name != 'Ready':
+ attempts = attempts + 1
+ time.sleep(1 * 60)
+ try:
+ machine.refresh()
+ self.logger.debug("Polling node %s status attempt %d: %s" % (n, attempts, machine.status_name))
+ except:
+ self.logger.warning("Error updating node %s status during commissioning, will re-attempt." %
+ (n))
+ if machine.status_name == 'Ready':
+ self.logger.info("Node %s commissioned." % (n))
+ result_detail['detail'].append("Node %s commissioned" % (n))
+ worked = True
+ elif machine.status_name == 'Commissioning':
+ self.logger.info("Located node %s in MaaS, node already being commissioned. Skipping..." % (n))
+ result_detail['detail'].append("Located node %s in MaaS, node already being commissioned. Skipping..." % (n))
+ worked = True
+ elif machine.status_name == 'Ready':
+ self.logger.info("Located node %s in MaaS, node commissioned. Skipping..." % (n))
+ result_detail['detail'].append("Located node %s in MaaS, node commissioned. Skipping..." % (n))
+ worked = True
+ else:
+ self.logger.warning("Located node %s in MaaS, unknown status %s. Skipping..." % (n, machine.status_name))
+ result_detail['detail'].append("Located node %s in MaaS, unknown status %s. Skipping..." % (n, machine.status_name))
+ failed = True
+ else:
+ self.logger.warning("Node %s not found in MaaS" % n)
+ failed = True
+ result_detail['detail'].append("Node %s not found in MaaS" % n)
+
+ except Exception as ex:
+ failed = True
+ result_detail['detail'].append("Error commissioning node %s: %s" % (n, str(ex)))
+
+ result = None
+ if worked and failed:
+ result = hd_fields.ActionResult.PartialSuccess
+ elif worked:
+ result = hd_fields.ActionResult.Success
+ elif failed:
+ result = hd_fields.ActionResult.Failure
+
+ self.orchestrator.task_field_update(self.task.get_id(),
+ status=hd_fields.TaskStatus.Complete,
+ result=result,
+ result_detail=result_detail)
\ No newline at end of file
diff --git a/drydock_provisioner/drivers/node/maasdriver/models/base.py b/drydock_provisioner/drivers/node/maasdriver/models/base.py
index 7d6cbd49..6082d964 100644
--- a/drydock_provisioner/drivers/node/maasdriver/models/base.py
+++ b/drydock_provisioner/drivers/node/maasdriver/models/base.py
@@ -44,7 +44,7 @@ class ResourceBase(object):
 updated_fields = resp.json()
- for f in self.json_fields:
+ for f in self.fields:
 if f in updated_fields.keys():
 setattr(self, f, updated_fields.get(f))
diff --git a/drydock_provisioner/drivers/node/maasdriver/models/machine.py b/drydock_provisioner/drivers/node/maasdriver/models/machine.py
index f4ec3609..ae32a37b 100644
--- a/drydock_provisioner/drivers/node/maasdriver/models/machine.py
+++ b/drydock_provisioner/drivers/node/maasdriver/models/machine.py
@@ -21,7 +21,7 @@ class Machine(model_base.ResourceBase):
 resource_url = 'machines/{resource_id}/'
 fields = ['resource_id', 'hostname', 'power_type', 'power_state', 'power_parameters', 'interfaces',
- 'boot_interface', 'memory', 'cpu_count', 'tag_names']
+ 'boot_interface', 'memory', 'cpu_count', 'tag_names', 'status_name']
 json_fields = ['hostname', 'power_type']
 def __init__(self, api_client, **kwargs):
@@ -31,6 +31,8 @@ class Machine(model_base.ResourceBase):
 if getattr(self, 'resource_id', None) is not None:
 self.interfaces = maas_interface.Interfaces(api_client, system_id=self.resource_id)
 self.interfaces.refresh()
+ else:
+ self.interfaces = None
 def get_power_params(self):
 url = self.interpolate_url()
@@ -54,6 +56,11 @@ class Machine(model_base.ResourceBase):
 if not resp.ok:
 raise Exception()
+ def get_network_interface(self, iface_name):
+ if self.interfaces is not None:
+ iface = self.interfaces.singleton({'name': iface_name})
+ return iface
+
 def get_details(self):
 url = self.interpolate_url()
@@ -142,7 +149,8 @@ class Machines(model_base.ResourceCollectionBase):
 maas_node.hostname = node_model.name
 maas_node.update()
self.logger.debug("Updated MaaS resource %s hostname to %s" % (maas_node.resource_id, node_model.name)) - return maas_node + + return maas_node except ValueError as ve: self.logger.warn("Error locating matching MaaS resource for OOB IP %s" % (node_oob_ip)) diff --git a/drydock_provisioner/orchestrator/__init__.py b/drydock_provisioner/orchestrator/__init__.py index 69984336..5b9ceb74 100644 --- a/drydock_provisioner/orchestrator/__init__.py +++ b/drydock_provisioner/orchestrator/__init__.py @@ -314,11 +314,35 @@ class Orchestrator(object): elif node_identify_task.get_result() in [hd_fields.ActionResult.PartialSuccess, hd_fields.ActionResult.Failure]: # TODO This threshold should be a configurable default and tunable by task API - if node_identify_attempts > 2: + if node_identify_attempts > 10: failed = True break - time.sleep(5 * 60) + time.sleep(1 * 60) + + # We can only commission nodes that were successfully identified in the provisioner + if len(node_identify_task.result_detail['successful_nodes']) > 0: + self.logger.info("Found %s successfully identified nodes, starting commissioning." % + (len(node_identify_task.result_detail['successful_nodes']))) + node_commission_task = self.create_task(tasks.DriverTask, + parent_task_id=task.get_id(), design_id=design_id, + action=hd_fields.OrchestratorAction.ConfigureHardware, + task_scope={'site': task_site, + 'node_names': node_identify_task.result_detail['successful_nodes']}) + + self.logger.info("Starting node driver task %s to commission nodes." % (node_commission_task.get_id())) + node_driver.execute_task(node_commission_task.get_id()) + + node_commission_task = self.state_manager.get_task(node_commission_task.get_id()) + + if node_commission_task.get_result() in [hd_fields.ActionResult.Success, + hd_fields.ActionResult.PartialSuccess]: + worked = True + elif node_commission_task.get_result() in [hd_fields.ActionResult.Failure, + hd_fields.ActionResult.PartialSuccess]: + failed = True + else: + self.logger.warning("No nodes successfully identified, skipping commissioning subtask") final_result = None if worked and failed: @@ -327,7 +351,8 @@ class Orchestrator(object): final_result = hd_fields.ActionResult.Success else: final_result = hd_fields.ActionResult.Failure - + + self.task_field_update(task_id, status=hd_fields.TaskStatus.Complete, result=final_result)