# Copyright 2017 AT&T Intellectual Property. All other rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Actions for the Orchestrator level of the Drydock workflow.""" import time import logging import concurrent.futures import uuid import drydock_provisioner.config as config import drydock_provisioner.error as errors import drydock_provisioner.objects.fields as hd_fields class BaseAction(object): """The base class for actions starts by the orchestrator.""" def __init__(self, task, orchestrator, state_manager): """Object initializer. :param task: objects.Task instance this action will execute against :param orchestrator: orchestrator.Orchestrator instance :param state_manager: state.DrydockState instnace used to access task state """ self.task = task self.orchestrator = orchestrator self.state_manager = state_manager self.logger = logging.getLogger( config.config_mgr.conf.logging.global_logger_name) def _parallelize_subtasks(self, fn, subtask_id_list, *args, **kwargs): """Spawn threads to execute fn for each subtask using concurrent.futures. Return a dictionary of task_id:concurrent.futures.Future instance :param fn: The callable to execute in a thread, expected it takes a task_id as first argument :param subtask_id_list: List of uuid.UUID ID of the subtasks to execute on :param *args: The args to pass to fn :param **kwargs: The kwargs to pass to fn """ task_futures = dict() with concurrent.futures.ThreadPoolExecutor(max_workers=16) as te: for t in subtask_id_list: task_futures[t.bytes] = te.submit(fn, t, *args, **kwargs) return task_futures def _collect_subtask_futures(self, subtask_futures, timeout=300): """Collect Futures executing on subtasks or timeout. Wait for Futures to finish or timeout. After timeout, enumerate the subtasks that timed out in task result messages. :param subtask_futures: dictionary of subtask_id.bytes -> Future instance :param timeout: The number of seconds to wait for all Futures to complete """ finished, timed_out = concurrent.futures.wait( subtask_futures.values(), timeout=timeout) self.task.align_result() for k, v in subtask_futures.items(): if not v.done(): self.task.add_status_msg( "Subtask thread for %s still executing after timeout." % str(uuid.UUID(bytes=k)), error=True, ctx=str(self.task.get_id()), ctx_type='task') self.task.failure() else: if v.exception(): self.logger.error( "Uncaught excetion in subtask %s future:" % str( uuid.UUID(bytes=k)), exc_info=v.exception()) if len(timed_out) > 0: raise errors.CollectSubtaskTimeout( "One or more subtask threads did not finish in %d seconds." % timeout) return def _load_site_design(self): """Load the site design from this action's task. The design_ref in the task can be resolved to a set of design documents that reflect the site design to be operated on. Load this design for use by this action. """ design_status, site_design = self.orchestrator.get_effective_site( self.task.design_ref) if design_status is None or design_status.status == hd_fields.ActionResult.Failure: raise errors.OrchestratorError("Site design failed load.") return site_design class Noop(BaseAction): """Dummy action to allow the full task completion flow without impacts.""" def start(self): """Start executing this action.""" self.logger.debug("Starting Noop Action.") self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() time.sleep(5) self.task = self.state_manager.get_task(self.task.get_id()) if self.task.check_terminate(): self.logger.debug("Terminating action.") self.task.set_status(hd_fields.TaskStatus.Terminated) self.task.failure() self.task.add_status_msg( msg="Action terminated.", ctx_type='NA', ctx='NA', error=False) else: self.logger.debug("Marked task as successful.") self.task.set_status(hd_fields.TaskStatus.Complete) self.task.success() self.task.add_status_msg( msg="Noop action.", ctx_type='NA', ctx='NA', error=False) self.task.save() self.logger.debug("Saved task state.") self.logger.debug("Finished Noop Action.") return class DestroyNodes(BaseAction): """Action to destroy nodes in prepartion for a redeploy.""" def start(self): """Start executing this action.""" self.task.set_status(hd_fields.TaskStatus.Complete) self.task.failure() self.task.save() return class ValidateDesign(BaseAction): """Action for validating the design document referenced by the task.""" def start(self): """Start executing this action.""" self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() try: status, site_design = self.orchestrator.get_effective_site( self.task.design_ref) self.task.merge_status_messages(task_result=status) self.task.set_status(hd_fields.TaskStatus.Complete) self.task.success() self.task.save() except Exception: self.task.set_status(hd_fields.TaskStatus.Complete) self.task.failure() self.task.save() return class VerifySite(BaseAction): """Action to verify downstream tools in the site are available and ready.""" def start(self): """Start executing this action in the context of the local task.""" self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() node_driver = self.orchestrator.enabled_drivers['node'] if node_driver is None: self.task.set_status(hd_fields.TaskStatus.Complete) self.task.add_status_msg( msg="No node driver enabled, ending task.", error=True, ctx=str(self.task.get_id()), ctx_type='task') self.task.result.set_message("No NodeDriver enabled.") self.task.result.set_reason("Bad Configuration.") self.task.failure() self.task.save() return node_driver_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.ValidateNodeServices) self.task.register_subtask(node_driver_task) node_driver.execute_task(node_driver_task.get_id()) node_driver_task = self.state_manager.get_task( node_driver_task.get_id()) self.task.add_status_msg( msg="Collected subtask %s" % str(node_driver_task.get_id()), error=False, ctx=str(node_driver_task.get_id()), ctx_type='task') self.task = self.state_manager.get_task(self.task.get_id()) self.task.set_status(hd_fields.TaskStatus.Complete) self.task.align_result() self.task.save() return class PrepareSite(BaseAction): """Action to configure site wide/inter-node settings.""" def start(self): """Start executing this action in the context of the local task.""" self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() driver = self.orchestrator.enabled_drivers['node'] if driver is None: self.task.set_status(hd_fields.TaskStatus.Complete) self.task.add_status_msg( msg="No node driver enabled, ending task.", error=True, ctx=str(self.task.get_id()), ctx_type='task') self.task.result.set_message("No NodeDriver enabled.") self.task.result.set_reason("Bad Configuration.") self.task.failure() self.task.save() return site_network_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.CreateNetworkTemplate) self.task.register_subtask(site_network_task) self.logger.info( "Starting node driver task %s to create network templates" % (site_network_task.get_id())) driver.execute_task(site_network_task.get_id()) self.task.add_status_msg( msg="Collected subtask %s" % str(site_network_task.get_id()), error=False, ctx=str(site_network_task.get_id()), ctx_type='task') self.logger.info("Node driver task %s complete" % (site_network_task.get_id())) user_creds_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.ConfigureUserCredentials) self.task.register_subtask(user_creds_task) self.logger.info( "Starting node driver task %s to configure user credentials" % (user_creds_task.get_id())) driver.execute_task(user_creds_task.get_id()) self.task.add_status_msg( msg="Collected subtask %s" % str(user_creds_task.get_id()), error=False, ctx=str(user_creds_task.get_id()), ctx_type='task') self.logger.info("Node driver task %s complete" % (site_network_task.get_id())) self.task.align_result() self.task.set_status(hd_fields.TaskStatus.Complete) self.task.save() return class VerifyNodes(BaseAction): """Action to verify the orchestrator has adequate access to a node to start the deployment.""" def start(self): """Start executing this action.""" self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() design_status, site_design = self.orchestrator.get_effective_site( self.task.design_ref) node_filter = self.task.node_filter oob_type_partition = {} target_nodes = self.orchestrator.process_node_filter( node_filter, site_design) for n in target_nodes: if n.oob_type not in oob_type_partition.keys(): oob_type_partition[n.oob_type] = [] oob_type_partition[n.oob_type].append(n) task_futures = dict() for oob_type, oob_nodes in oob_type_partition.items(): oob_driver = None for d in self.orchestrator.enabled_drivers['oob']: if d.oob_type_support(oob_type): oob_driver = d break if oob_driver is None: self.logger.warning( "Node OOB type %s has no enabled driver." % oob_type) self.task.failure() for n in oob_nodes: self.task.add_status_msg( msg="Node %s OOB type %s is not supported." % (n.get_name(), oob_type), error=True, ctx=n.get_name(), ctx_type='node') continue nf = self.orchestrator.create_nodefilter_from_nodelist(oob_nodes) oob_driver_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.InterrogateOob, node_filter=nf) self.task.register_subtask(oob_driver_task) self.logger.info( "Starting task %s for node verification via OOB type %s" % (oob_driver_task.get_id(), oob_type)) task_futures.update( self._parallelize_subtasks(oob_driver.execute_task, [oob_driver_task.get_id()])) try: self._collect_subtask_futures( task_futures, timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60)) self.logger.debug( "Collected subtasks for task %s" % str(self.task.get_id())) except errors.CollectSubtaskTimeout as ex: self.logger.warning(str(ex)) self.task.set_status(hd_fields.TaskStatus.Complete) return class PrepareNodes(BaseAction): """Action to prepare a node for deployment.""" def start(self): """Start executing this action.""" self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() # NOTE(sh8121att) Should we attempt to interrogate the node via Node # Driver to see if it is in a deployed state before we # start rebooting? Or do we just leverage # Drydock internal state via site build data (when implemented)? node_driver = self.orchestrator.enabled_drivers['node'] if node_driver is None: self.task.set_status(hd_fields.TaskStatus.Complete) self.task.add_status_msg( msg="No node driver enabled, ending task.", error=True, ctx=str(self.task.get_id()), ctx_type='task') self.task.result.set_message("No NodeDriver enabled.") self.task.result.set_reason("Bad Configuration.") self.task.failure() self.task.save() return design_status, site_design = self.orchestrator.get_effective_site( self.task.design_ref) target_nodes = self.orchestrator.process_node_filter( self.task.node_filter, site_design) oob_type_partition = {} for n in target_nodes: if n.oob_type not in oob_type_partition.keys(): oob_type_partition[n.oob_type] = [] oob_type_partition[n.oob_type].append(n) task_futures = dict() oob_type_filters = dict() for oob_type, oob_nodes in oob_type_partition.items(): oob_driver = None for d in self.orchestrator.enabled_drivers['oob']: if d.oob_type_support(oob_type): oob_driver = d break if oob_driver is None: self.logger.warning( "Node OOB type %s has no enabled driver." % oob_type) self.task.failure() for n in oob_nodes: self.task.add_status_msg( msg="Node %s OOB type %s is not supported." % (n.get_name(), oob_type), error=True, ctx=n.get_name(), ctx_type='node') continue oob_type_filters[ oob_type] = self.orchestrator.create_nodefilter_from_nodelist( oob_nodes) setboot_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.SetNodeBoot, node_filter=oob_type_filters[oob_type]) self.task.register_subtask(setboot_task) self.logger.info( "Starting OOB driver task %s to set PXE boot for OOB type %s" % (setboot_task.get_id(), oob_type)) task_futures.update( self._parallelize_subtasks(oob_driver.execute_task, [setboot_task.get_id()])) try: self._collect_subtask_futures( task_futures, timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60)) # Get successful nodes and add it to the node filter # so the next step only happens for successfully configured nodes self.task.bubble_results( action_filter=hd_fields.OrchestratorAction.SetNodeBoot) for t, f in oob_type_filters.items(): oob_type_filters[t]['filter_set'].append( dict( filter_type='union', node_names=self.task.result.successes)) self.logger.debug( "Collected subtasks for task %s" % str(self.task.get_id())) except errors.CollectSubtaskTimeout as ex: self.logger.warning(str(ex)) task_futures = dict() for oob_type, oob_nodes in oob_type_partition.items(): oob_driver = None for d in self.orchestrator.enabled_drivers['oob']: if d.oob_type_support(oob_type): oob_driver = d break if oob_driver is None: self.logger.warning( "Node OOB type %s has no enabled driver." % oob_type) self.task.failure() for n in oob_nodes: self.task.add_status_msg( msg="Node %s OOB type %s is not supported." % (n.get_name(), oob_type), error=True, ctx=n.get_name(), ctx_type='node') continue cycle_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.PowerCycleNode, node_filter=oob_type_filters[oob_type]) self.task.register_subtask(cycle_task) self.logger.info( "Starting OOB driver task %s to power cycle nodes for OOB type %s" % (cycle_task.get_id(), oob_type)) task_futures.update( self._parallelize_subtasks(oob_driver.execute_task, [cycle_task.get_id()])) try: self._collect_subtask_futures( task_futures, timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60)) # Get successful nodes and add it to the node filter # so the next step only happens for successfully configured nodes self.task.bubble_results( action_filter=hd_fields.OrchestratorAction.PowerCycleNode) for t, f in oob_type_filters.items(): oob_type_filters[t]['filter_set'].append( dict( filter_type='union', node_names=self.task.result.successes)) self.logger.debug( "Collected subtasks for task %s" % str(self.task.get_id())) except errors.CollectSubtaskTimeout as ex: self.logger.warning(str(ex)) # IdentifyNode success will take some time after PowerCycleNode finishes # Retry the operation a few times if it fails before considering it a final failure # Each attempt is a new task which might make the final task tree a bit confusing max_attempts = config.config_mgr.conf.timeouts.identify_node * ( 60 / config.config_mgr.conf.poll_interval) self.logger.debug( "Will make max of %d attempts to complete the identify_node task." % max_attempts) nf = self.task.node_filter_from_successes() node_identify_task = None while True: if node_identify_task is None: node_identify_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.IdentifyNode, node_filter=nf) self.task.register_subtask(node_identify_task) self.logger.info( "Starting node driver task %s to identify nodes." % (node_identify_task.get_id())) node_driver.execute_task(node_identify_task.get_id()) node_identify_task = self.state_manager.get_task( node_identify_task.get_id()) node_identify_task.bubble_results() try: if not node_identify_task.retry_task( max_attempts=max_attempts): break time.sleep(config.config_mgr.conf.poll_interval) except errors.MaxRetriesReached: self.task.failure() break # We can only commission nodes that were successfully identified in the provisioner if len(node_identify_task.result.successes) > 0: target_nf = node_identify_task.node_filter_from_successes() self.logger.info( "Found %s successfully identified nodes, starting commissioning." % (len(node_identify_task.result.successes))) node_commission_task = None while True: if node_commission_task is None: node_commission_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.ConfigureHardware, node_filter=target_nf) self.task.register_subtask(node_commission_task) self.logger.info( "Starting node driver task %s to commission nodes." % (node_commission_task.get_id())) node_driver.execute_task(node_commission_task.get_id()) node_commission_task = self.state_manager.get_task( node_commission_task.get_id()) try: if not node_commission_task.retry_task(max_attempts=3): break except errors.MaxRetriesReached: self.task.failure() break else: self.logger.warning( "No nodes successfully identified, skipping commissioning subtask" ) self.task.align_result() self.task.set_status(hd_fields.TaskStatus.Complete) self.task.save() return class DeployNodes(BaseAction): """Action to deploy a node with a persistent OS.""" def start(self): """Start executing this action.""" self.task.set_status(hd_fields.TaskStatus.Running) self.task.save() node_driver = self.orchestrator.enabled_drivers['node'] if node_driver is None: self.task.set_status(hd_fields.TaskStatus.Complete) self.task.add_status_msg( msg="No node driver enabled, ending task.", error=True, ctx=str(self.task.get_id()), ctx_type='task') self.task.result.set_message("No NodeDriver enabled.") self.task.result.set_reason("Bad Configuration.") self.task.failure() self.task.save() return node_networking_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.ApplyNodeNetworking, node_filter=self.task.node_filter) self.task.register_subtask(node_networking_task) self.logger.info( "Starting node driver task %s to apply networking on nodes." % (node_networking_task.get_id())) node_driver.execute_task(node_networking_task.get_id()) node_networking_task = self.state_manager.get_task( node_networking_task.get_id()) node_storage_task = None if len(node_networking_task.result.successes) > 0: self.logger.info( "Found %s successfully networked nodes, configuring storage." % (len(node_networking_task.result.successes))) node_storage_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.ApplyNodeStorage, node_filter=node_networking_task.node_filter_from_successes()) self.logger.info( "Starting node driver task %s to configure node storage." % (node_storage_task.get_id())) node_driver.execute_task(node_storage_task.get_id()) node_storage_task = self.state_manager.get_task( node_storage_task.get_id()) else: self.logger.warning( "No nodes successfully networked, skipping storage configuration subtask." ) node_platform_task = None if (node_storage_task is not None and len(node_storage_task.result.successes) > 0): self.logger.info( "Configured storage on %s nodes, configuring platform." % (len(node_storage_task.result.successes))) node_platform_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.ApplyNodePlatform, node_filter=node_storage_task.node_filter_from_successes()) self.task.register_subtask(node_platform_task) self.logger.info( "Starting node driver task %s to configure node platform." % (node_platform_task.get_id())) node_driver.execute_task(node_platform_task.get_id()) node_platform_task = self.state_manager.get_task( node_platform_task.get_id()) else: self.logger.warning( "No nodes with storage configuration, skipping platform configuration subtask." ) node_deploy_task = None if node_platform_task is not None and len( node_platform_task.result.successes) > 0: self.logger.info( "Configured platform on %s nodes, starting deployment." % (len(node_platform_task.result.successes))) while True: if node_deploy_task is None: node_deploy_task = self.orchestrator.create_task( design_ref=self.task.design_ref, action=hd_fields.OrchestratorAction.DeployNode, node_filter=node_platform_task. node_filter_from_successes()) self.logger.info( "Starting node driver task %s to deploy nodes." % (node_deploy_task.get_id())) node_driver.execute_task(node_deploy_task.get_id()) node_deploy_task = self.state_manager.get_task( node_deploy_task.get_id()) try: if not node_deploy_task.retry_task(max_attempts=3): break except errors.MaxRetriesReached: self.task.failure() break else: self.logger.warning( "Unable to configure platform on any nodes, skipping deploy subtask" ) self.task.set_status(hd_fields.TaskStatus.Complete) self.task.align_result() self.task.save() return