# Copyright 2018 AT&T Intellectual Property.  All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare and deploy nodes using Drydock

Uses the deployment strategy named in the deployment-configuration to
progress through preparation and deployment of nodes in a group-based
fashion.

In the case of no specified deployment strategy, an "all-at-once" approach
is taken, by which all nodes are deployed together.

Historical Note: This operator replaces the function of the
drydock_prepare_nodes and drydock_deploy_nodes operators that existed
previously.
"""
import logging
import time

from airflow.exceptions import AirflowException
from airflow.plugins_manager import AirflowPlugin

from shipyard_airflow.common.deployment_group.deployment_group import Stage
from shipyard_airflow.common.deployment_group.deployment_group_manager import \
    DeploymentGroupManager
from shipyard_airflow.common.deployment_group.node_lookup import NodeLookup

try:
    import check_k8s_node_status
    from drydock_base_operator import DrydockBaseOperator
    from drydock_base_operator import gen_node_name_filter
    from drydock_errors import (
        DrydockTaskFailedException,
        DrydockTaskTimeoutException
    )
except ImportError:
    from shipyard_airflow.plugins import check_k8s_node_status
    from shipyard_airflow.plugins.drydock_base_operator import \
        DrydockBaseOperator
    from shipyard_airflow.plugins.drydock_base_operator import \
        gen_node_name_filter
    from shipyard_airflow.plugins.drydock_errors import (
        DrydockTaskFailedException,
        DrydockTaskTimeoutException
    )

LOG = logging.getLogger(__name__)


class DrydockNodesOperator(DrydockBaseOperator):
    """Drydock Nodes Operator

    Using a deployment strategy to calculate the deployment sequence,
    deploy a series of baremetal nodes using Drydock.
    """

    def do_execute(self):
        self._setup_configured_values()
        # setup self.strategy
        self.strategy = self.get_deployment_strategy()
        dgm = _get_deployment_group_manager(
            self.strategy['groups'],
            _get_node_lookup(self.drydock_client, self.design_ref)
        )

        _process_deployment_groups(dgm,
                                   self._execute_prepare,
                                   self._execute_deployment)

        # All groups are as "complete" as they're going to be. Report summary
        dgm.report_group_summary()
        dgm.report_node_summary()
        if dgm.critical_groups_failed():
            raise AirflowException(
                "One or more deployment groups marked as critical have failed"
            )
        else:
            LOG.info("All critical groups have met their success criteria")
            # TODO (bryan-strassner) It is very possible that many nodes
            #     failed deployment, but all critical groups had enough
            #     success to continue processing. This will be non-obvious
            #     to the casual observer of the workflow. A likely
            #     enhancement is to allow notes to be added to the shipyard
            #     action associated with this workflow that would be
            #     reported back to the end user doing a describe of the
            #     action. This will require new database structures to hold
            #     the notes, and a means to insert the notes. A shared
            #     functionality in the base ucp operator or a common module
            #     would be a reasonable way to support this.
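
    # Note: the strategy consumed by do_execute() is a dict with a 'groups'
    # key; gen_simple_deployment_strategy() near the end of this module shows
    # the group shape used (name, critical, depends_on, selectors,
    # success_criteria).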
    def _setup_configured_values(self):
        """Sets self.* values from the deployment configuration"""
        # Retrieve query intervals and timeouts
        # Intervals - How often will something be queried for status.
        self.dep_interval = self.dc['physical_provisioner.deploy_interval']
        self.node_st_interval = self.dc['kubernetes.node_status_interval']
        self.prep_interval = self.dc[
            'physical_provisioner.prepare_node_interval'
        ]
        # Timeouts - Time Shipyard waits for completion of a task.
        self.dep_timeout = self.dc['physical_provisioner.deploy_timeout']
        self.node_st_timeout = self.dc['kubernetes.node_status_timeout']
        self.prep_timeout = self.dc[
            'physical_provisioner.prepare_node_timeout'
        ]
        # The time to wait before querying k8s node status after Drydock
        # deploys nodes
        self.join_wait = self.dc['physical_provisioner.join_wait']

    def _execute_prepare(self, group):
        """Executes the prepare nodes step for the group.

        :param group: the DeploymentGroup to prepare
        Returns a QueryTaskResult object
        """
        LOG.info("Group %s is preparing nodes", group.name)
        self.node_filter = gen_node_name_filter(group.actionable_nodes)
        return self._execute_task('prepare_nodes',
                                  self.prep_interval,
                                  self.prep_timeout)

    def _execute_deployment(self, group):
        """Execute the deployment of nodes for the group.

        :param group: The DeploymentGroup to deploy
        Returns a QueryTaskResult object
        """
        LOG.info("Group %s is deploying nodes", group.name)
        self.node_filter = gen_node_name_filter(group.actionable_nodes)
        task_result = self._execute_task('deploy_nodes',
                                         self.dep_interval,
                                         self.dep_timeout)

        if not task_result.successes:
            # If there are no successes from Drydock, there is no need to
            # wait and check on the results from node status.
            LOG.info("There are no nodes indicated as successful from "
                     "Drydock. Skipping waiting for Kubernetes node join "
                     "and proceeding to validation")
            return task_result

        # It takes time for the cluster join process to be triggered across
        # all the nodes in the cluster. Hence there is a need to back off
        # and wait before checking the state of the cluster join process.
        LOG.info("Nodes <%s> reported as deployed in MAAS",
                 ", ".join(task_result.successes))
        LOG.info("Waiting for %d seconds before checking node state...",
                 self.join_wait)
        time.sleep(self.join_wait)

        # Check that the cluster join process is completed before declaring
        # deploy_nodes 'completed'.
        # This should only include nodes that Drydock has indicated as
        # successful and has passed the join script to.
        # Anything not ready within the timeout is considered a failure.
        not_ready_list = check_k8s_node_status.check_node_status(
            self.node_st_timeout,
            self.node_st_interval,
            expected_nodes=task_result.successes
        )
        for node in not_ready_list:
            # Remove nodes that are not ready from the list of successes,
            # since they did not complete deployment successfully.
            try:
                LOG.info("Node %s failed to join the Kubernetes cluster or "
                         "was not ready in time", node)
                task_result.successes.remove(node)
            except (ValueError, KeyError):
                # This node is not joined, but was not one that we were
                # looking for either.
LOG.info("%s failed to join Kubernetes, but was not in the " "Drydock results: %s", node, ", ".join(task_result.successes)) return task_result def _execute_task(self, task_name, interval, timeout): """Execute the Drydock task requested :param task_name: 'prepare_nodes', 'deploy_nodes' :param interval: The time between checking status on the task :param timeout: The total time allowed for the task Wraps the query_task method in the base class, capturing AirflowExceptions and summarizing results into a response QueryTaskResult object Note: It does not matter if the task ultimately succeeds or fails in Drydock - the base class will handle all the logging and etc for the purposes of troubleshooting. What matters is the node successes. Following any result of query_task, this code will re-query the task results from Drydock to gather the node successes placing them into the successes list in the response object. In the case of a failure to get the task results, this workflow must assume that the result is a total loss, and pass back no successes """ self.create_task(task_name) result = QueryTaskResult(self.drydock_task_id, task_name) try: self.query_task(interval, timeout) except DrydockTaskFailedException: # Task failure may be successful enough based on success criteria. # This should not halt the overall flow of this workflow step. LOG.warn( "Task %s has failed. Logs contain details of the failure. " "Some nodes may be succesful, processing continues", task_name ) except DrydockTaskTimeoutException: # Task timeout may be successful enough based on success criteria. # This should not halt the overall flow of this workflow step. LOG.warn( "Task %s has timed out after %s seconds. Logs contain details " "of the failure. Some nodes may be succesful, processing " "continues", task_name, timeout ) # Other AirflowExceptions will fail the whole task - let them do this. # find successes result.successes = self.get_successes_for_task(self.drydock_task_id) return result def get_deployment_strategy(self): """Determine the deployment strategy Uses the specified strategy from the deployment configuration or returns a default configuration of 'all-at-once' """ if self.target_nodes: # Set up a strategy with one group with the list of nodes, so those # nodes are the only nodes processed. LOG.info("Seting up deployment strategy using targeted nodes") strat_name = 'targeted nodes' strategy = gen_simple_deployment_strategy(name='target-group', nodes=self.target_nodes) else: # Otherwise, do a strategy for the site - either from the # configdocs or a default "everything". strat_name = self.dc['physical_provisioner.deployment_strategy'] if strat_name: # if there is a deployment strategy specified, use it strategy = self.get_unique_doc( name=strat_name, schema="shipyard/DeploymentStrategy/v1" ) else: # The default behavior is to deploy all nodes, and fail if # any nodes fail to deploy. strat_name = 'all-at-once (defaulted)' strategy = gen_simple_deployment_strategy() LOG.info("Strategy Name: %s has %s groups", strat_name, len(strategy.get('groups', []))) return strategy # # Functions supporting the nodes operator class # def gen_simple_deployment_strategy(name=None, nodes=None): """Generates a single group deployment strategy :param name: the name of the single group. Defaults to 'default' :param nodes: the list of node_names to be used. 
        Defaults to []
    """
    target_name = name or 'default'
    target_nodes = list(nodes) if nodes else []

    return {
        'groups': [
            {
                'name': target_name,
                'critical': True,
                'depends_on': [],
                'selectors': [
                    {
                        'node_names': target_nodes,
                        'node_labels': [],
                        'node_tags': [],
                        'rack_names': [],
                    },
                ],
                'success_criteria': {
                    'percent_successful_nodes': 100
                },
            }
        ]
    }


def _get_node_lookup(drydock_client, design_ref):
    """Return a NodeLookup suitable for the DeploymentGroupManager

    :param drydock_client: the drydock_client object
    :param design_ref: the design_ref for the NodeLookup
    """
    return NodeLookup(drydock_client, design_ref).lookup


def _get_deployment_group_manager(groups_dict_list, node_lookup):
    """Return a DeploymentGroupManager suitable for managing this deployment

    :param groups_dict_list: the list of group dictionaries to use
    :param node_lookup: a NodeLookup object that will be used by this
        DeploymentGroupManager
    """
    return DeploymentGroupManager(groups_dict_list, node_lookup)


def _process_deployment_groups(dgm, prepare_func, deploy_func):
    """Executes the deployment group deployments

    :param dgm: the DeploymentGroupManager object that manages the
        dependency chain of groups
    :param prepare_func: a function that accepts a DeploymentGroup and
        returns a QueryTaskResult, with the purpose of preparing nodes
    :param deploy_func: a function that accepts a DeploymentGroup and
        returns a QueryTaskResult, with the purpose of deploying nodes
    """
    complete = False
    while not complete:
        # Find the next group to be prepared. Prepare and deploy it.
        group = dgm.get_next_group(Stage.PREPARED)
        if group is None:
            LOG.info("There are no more groups eligible to process")
            # Whether or not really complete, the processing loop is done.
            complete = True
            continue
        LOG.info("*** Deployment Group: %s is being processed ***",
                 group.name)
        if not group.actionable_nodes:
            LOG.info("There were no actionable nodes for group %s. It is "
                     "possible that all nodes: [%s] have previously been "
                     "deployed. Group will be immediately checked "
                     "against its success criteria",
                     group.name,
                     ", ".join(group.full_nodes))
            # In the case of a group having no actionable nodes, since groups
            # prepare -> deploy in direct sequence, we can check against
            # deployment, since all nodes would need to be deployed or have
            # been attempted. Need to follow the state transition, so
            # PREPARED -> DEPLOYED
            dgm.evaluate_group_succ_criteria(group.name, Stage.PREPARED)
            dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)
            # Success or failure, move on to the next group.
            continue
        LOG.info("%s has actionable nodes: [%s]",
                 group.name,
                 ", ".join(group.actionable_nodes))
        if len(group.actionable_nodes) < len(group.full_nodes):
            LOG.info("Some nodes are not actionable because they were "
                     "included in a prior group, but will be considered in "
                     "the success criteria calculation for this group")

        # Group has actionable nodes.
        # Prepare nodes for the group, store the QueryTaskResult.
        prep_qtr = prepare_func(group)
        # Mark successes as prepared.
        for node_name in prep_qtr.successes:
            dgm.mark_node_prepared(node_name)

        dgm.fail_unsuccessful_nodes(group, prep_qtr.successes)
        should_deploy = dgm.evaluate_group_succ_criteria(group.name,
                                                         Stage.PREPARED)
        if not should_deploy:
            # The group has failed; move on to the next group. The current
            # group has already been marked as failed.
            continue

        # Continue with deployment
        dep_qtr = deploy_func(group)
        # Mark successes as deployed.
        for node_name in dep_qtr.successes:
            dgm.mark_node_deployed(node_name)

        dgm.fail_unsuccessful_nodes(group, dep_qtr.successes)
        dgm.evaluate_group_succ_criteria(group.name, Stage.DEPLOYED)


class QueryTaskResult:
    """Represents a summarized query result from a task"""
    def __init__(self, task_id, task_name):
        self.task_id = task_id
        self.task_name = task_name
        # The succeeded node names
        self.successes = []


class DrydockNodesOperatorPlugin(AirflowPlugin):
    """Creates DrydockNodesOperator in Airflow."""
    name = 'drydock_nodes_operator'
    operators = [DrydockNodesOperator]
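

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the operator's runtime path): running
# this module directly prints the single-group strategy produced by
# gen_simple_deployment_strategy(), the same shape this operator builds for
# the default "all-at-once" and targeted-nodes cases. The node names below
# are hypothetical examples.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import pprint

    example_strategy = gen_simple_deployment_strategy(
        name='example-group',
        nodes=['node01', 'node02']  # hypothetical node names
    )
    pprint.pprint(example_strategy)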