shipyard/src/bin/shipyard_airflow/shipyard_airflow/common/deployment_group/deployment_group.py

362 lines
13 KiB
Python

# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Deployment group module
Encapsulates classes and functions that provide core deployment group
functionality used during baremetal provisioning.
"""
from enum import Enum
import logging
import operator
from .errors import DeploymentGroupLabelFormatError
from .errors import DeploymentGroupStageError
from .errors import InvalidDeploymentGroupError
from .errors import InvalidDeploymentGroupNodeLookupError
LOG = logging.getLogger(__name__)
def check_label_format(label_string):
    """Ensure a label is written as ``key:value``.

    :param label_string: the label text to validate

    The label must contain exactly one colon, and the text on each side of
    that colon must contain something other than whitespace.

    Raises DeploymentGroupLabelFormatError if the label is not compliant.
    Returns None when the label is acceptable.
    """
    parts = label_string.split(":")

    # Exactly one colon yields exactly two parts.
    if len(parts) != 2:
        raise DeploymentGroupLabelFormatError(
            "Label {} is formatted incorrectly. One : (colon) character is "
            "required, and the label must be in key:value format".format(
                label_string)
        )

    # Neither the key nor the value may be blank.
    if any(part.strip() == "" for part in parts):
        raise DeploymentGroupLabelFormatError(
            "Label {} is formatted incorrectly. The values on either side "
            "of the colon character must not be empty.".format(
                label_string)
        )
class Stage(Enum):
    """Stages of deployment that apply to both baremetal nodes and
    deployment groups.
    """
    # Deployment has not begun yet. This is the default stage.
    NOT_STARTED = 'NOT_STARTED'

    # The prepare_node stage finished successfully
    PREPARED = 'PREPARED'

    # The deploy_node stage finished successfully
    DEPLOYED = 'DEPLOYED'

    # Some step failed to complete.
    FAILED = 'FAILED'

    @classmethod
    def is_complete(cls, stage):
        """Return True if the stage is terminal (DEPLOYED or FAILED)"""
        return stage in (cls.DEPLOYED, cls.FAILED)

    @classmethod
    def previous_stage(cls, stage):
        """List the stages from which the supplied stage may be entered

        :param stage: the stage being transitioned to

        Returns a new list of the valid prior stages; the list is empty for
        NOT_STARTED. Raises DeploymentGroupStageError if the supplied value
        is not one of the known stages.
        """
        # Built inside the method: a class-level attribute would be turned
        # into an Enum member. Compared with == (not a dict lookup) so any
        # value, hashable or not, can be checked.
        transitions = (
            (cls.NOT_STARTED, []),
            (cls.PREPARED, [cls.NOT_STARTED]),
            (cls.DEPLOYED, [cls.PREPARED]),
            (cls.FAILED, [cls.NOT_STARTED, cls.PREPARED]),
        )
        for target, allowed_prior in transitions:
            if stage == target:
                return allowed_prior

        raise DeploymentGroupStageError("{} is not a valid stage".format(
            str(stage)))
class GroupNodeSelector:
    """GroupNodeSelector object

    :param selector_dict: dictionary representing the possible selector values

    Holds the criteria that make up one selector of a deployment group.
    Any criteria missing from the dictionary default to an empty list.

    Example selector_dict::

        {
            'node_names': [],
            'node_labels': [],
            'node_tags': ['control'],
            'rack_names': ['rack03'],
        }
    """

    def __init__(self, selector_dict):
        lookup = selector_dict.get
        self.node_names = lookup('node_names', [])
        self.node_labels = lookup('node_labels', [])
        self.node_tags = lookup('node_tags', [])
        self.rack_names = lookup('rack_names', [])

        # Every label must be in key:value form; the check raises otherwise.
        for node_label in self.node_labels:
            check_label_format(node_label)

        # With no criteria at all, the selector matches every node.
        has_criteria = bool(self.node_names or self.node_labels or
                            self.node_tags or self.rack_names)
        self.all_selector = not has_criteria
        if self.all_selector:
            LOG.debug("Selector values select all available nodes")

    def get_node_labels_as_dict(self):
        """Return the node labels as a dictionary of key -> value

        Keys and values have surrounding whitespace removed.
        """
        labels = {}
        for node_label in self.node_labels:
            pieces = node_label.split(':')
            labels[pieces[0].strip()] = pieces[1].strip()
        return labels
class SuccessCriteria:
    """Defines the success criteria for a deployment group

    :param criteria: a dictionary containing up to 3 fields in
        percent_successful_nodes, minimum_successful_nodes,
        maximum_failed_nodes

    If no criteria are specified, all results are considered a success
    """

    def __init__(self, criteria):
        # A missing or empty criteria dictionary means nothing can fail.
        self._always_succeed = not criteria

        # Always define the threshold attributes, even when there are no
        # criteria, so that reading them never raises AttributeError. A
        # threshold left as None is simply not evaluated.
        thresholds = criteria or {}
        self.pct_succ_nodes = thresholds.get('percent_successful_nodes')
        self.min_succ_nodes = thresholds.get('minimum_successful_nodes')
        self.max_failed_nodes = thresholds.get('maximum_failed_nodes')

    def get_failed(self, succ_list, all_nodes_list):
        """Determine which criteria have failed.

        :param succ_list: A list of names of nodes that have successfully
            completed a stage
        :param all_nodes_list: A list of all node names that are to be
            evaluated against.

        Using the provided list of successful nodes, and the list of all
        nodes, check which of the success criteria have failed to have been
        met.

        Returns a list of failure dictionaries, each with the keys
        "criteria", "needed" and "actual". The list is empty when every
        specified criteria is met, when no criteria were specified, or when
        all_nodes_list is empty.
        """
        failures = []

        # If no criteria, or list of all nodes is empty, return empty list
        if self._always_succeed or not all_nodes_list:
            return failures

        succ_set = set(succ_list)
        all_set = set(all_nodes_list)

        # Only nodes belonging to all_set count; successes reported for
        # other nodes are ignored.
        all_size = len(all_set)
        succ_size = len(succ_set & all_set)
        fail_size = len(all_set - succ_set)

        # Multiply before dividing so the value is rounded only once.
        # Dividing first gives e.g. 57 / 100 * 100 == 56.99999999999999,
        # which would wrongly fail a threshold of 57.
        actual_pct_succ = succ_size * 100 / all_size

        failures.extend(self._check("percent_successful_nodes",
                                    actual_pct_succ, operator.ge,
                                    self.pct_succ_nodes))
        failures.extend(self._check("minimum_successful_nodes", succ_size,
                                    operator.ge, self.min_succ_nodes))
        failures.extend(self._check("maximum_failed_nodes", fail_size,
                                    operator.le, self.max_failed_nodes))
        return failures

    def _check(self, name, actual, op, needed):
        """Evaluates a single criteria

        :param name: name of the check
        :param actual: the result that was achieved (LHS)
        :param op: operator used for comparison
        :param needed: the threshold of success (RHS). If this parameter
            is None, the criteria is ignored as "successful" because it
            was not set as a needed criteria

        Returns a list containing the failure dictionary if the comparison
        fails or an empty list if check is successful.
        """
        if needed is None:
            LOG.info(" - %s criteria not specified, not evaluated", name)
            return []

        if op(actual, needed):
            LOG.info(" - %s succeeded, %s %s %s", name, actual, op.__name__,
                     needed)
            return []

        fail = {"criteria": name, "needed": needed, "actual": actual}
        LOG.info(" - %s failed, %s %s %s", name, actual, op.__name__,
                 needed)
        return [fail]
class DeploymentGroup:
    """DeploymentGroup object representing a deployment group

    :param group_dict: dictionary representing a group
    :param node_lookup: an injected function that will perform node lookup for
        a group. Function must accept an iterable of GroupNodeSelector and
        return a string iterable of node names (or empty iterable if there are
        no node names)

    Example group_dict::

        {
            'name': 'control-nodes',
            'critical': True,
            'depends_on': ['ntp-node'],
            'selectors': [
                {
                    'node_names': [],
                    'node_labels': [],
                    'node_tags': ['control'],
                    'rack_names': ['rack03'],
                },
            ],
            'success_criteria': {
                'percent_successful_nodes': 90,
                'minimum_successful_nodes': 3,
                'maximum_failed_nodes': 1,
            },
        }
    """

    def __init__(self, group_dict, node_lookup):
        # store the original dictionary
        self._group_dict = group_dict

        # fields required by schema
        self._check_required_fields()
        self.critical = group_dict['critical']
        self.depends_on = group_dict['depends_on']
        self.name = group_dict['name']

        self.selectors = [GroupNodeSelector(selector_dict)
                          for selector_dict in group_dict['selectors']]
        if not self.selectors:
            # no selectors means add an "all" selector
            self.selectors.append(GroupNodeSelector({}))

        self.success_criteria = SuccessCriteria(
            group_dict.get('success_criteria', {})
        )

        # all groups start as NOT_STARTED. The private field is seeded with
        # None so that the property setter accepts the initial stage.
        self.__stage = None
        self.stage = Stage.NOT_STARTED

        # node_lookup function for use with this deployment group
        # lookup the full list of nodes for this group's selectors
        self.node_lookup = node_lookup
        self.full_nodes = self._calculate_all_nodes()

        # actionable_nodes is set up based on multi-group interaction.
        # Only declaring the field here. Used for deduplication.
        self.actionable_nodes = []

    @property
    def stage(self):
        """The current Stage of this group"""
        return self.__stage

    @stage.setter
    def stage(self, stage):
        """Move the group to a new stage, enforcing valid transitions

        Setting the stage the group is already in is a no-op. Raises
        DeploymentGroupStageError if the supplied stage is not a valid stage,
        or cannot be entered from the group's current stage.
        """
        # Looking up the valid prior stages also validates the new stage.
        valid_prior = Stage.previous_stage(stage)
        pre_change_stage = self.__stage

        if pre_change_stage == stage:
            return

        # A group with no stage yet may only enter a stage that has no
        # prior stages; otherwise the current stage must be a valid prior.
        initial_assignment = pre_change_stage is None and not valid_prior
        if initial_assignment or pre_change_stage in valid_prior:
            self.__stage = stage
        else:
            raise DeploymentGroupStageError(
                "{} is not a valid stage for a group in stage {}".format(
                    stage, self.__stage
                ))
        LOG.info("Setting group %s with %s -> %s",
                 self.name,
                 pre_change_stage,
                 stage)

    def _check_required_fields(self):
        """Checks for required input fields and errors if any are missing

        Raises InvalidDeploymentGroupError naming the first missing field.
        """
        for attr in ['critical', 'depends_on', 'name', 'selectors']:
            try:
                value = self._group_dict[attr]
            except KeyError as err:
                raise InvalidDeploymentGroupError(
                    "Attribute '{}' is required as input to create a "
                    "DeploymentGroup".format(attr)) from err
            else:
                LOG.debug("Attribute %s has value %s", attr, str(value))

    def _calculate_all_nodes(self):
        """Invoke the node_lookup to retrieve nodes

        After construction of the DeploymentGroup, this method is generally
        not useful as the results are stored in self.full_nodes

        Returns a list of node names; a lookup result of None is treated as
        an empty list. Raises InvalidDeploymentGroupNodeLookupError if the
        lookup result is a single string, is not iterable, or contains
        anything other than strings.
        """
        LOG.debug("Beginning lookup of nodes for group %s", self.name)
        nodes = self.node_lookup(self.selectors)
        if nodes is None:
            nodes = []

        # A str is itself iterable, so it would otherwise be split into
        # one-character "node names" and pass the all-strings check below.
        if isinstance(nodes, str):
            raise InvalidDeploymentGroupNodeLookupError(
                "The node lookup function supplied to the DeploymentGroup "
                "returned a single string instead of an iterable of node "
                "names"
            )

        try:
            node_list = list(nodes)
        except TypeError as err:
            raise InvalidDeploymentGroupNodeLookupError(
                "The node lookup function supplied to the DeploymentGroup "
                "is not an iterable"
            ) from err

        if not all(isinstance(node, str) for node in node_list):
            raise InvalidDeploymentGroupNodeLookupError(
                "The node lookup function supplied to the DeploymentGroup "
                "is not all strings"
            )
        LOG.info("Group %s selectors have resolved to nodes: %s",
                 self.name, ", ".join(node_list))
        return node_list

    def get_failed_success_criteria(self, success_node_list):
        """Check the success criteria for this group.

        :param success_node_list: list of nodes that are deemed successful
            to be compared to the success criteria

        Using the list of all nodes, and the provided success_node_list,
        use the SuccessCriteria for this group to see if that list of
        successes meets the criteria.

        Note that this is not checking for any particular stage of deployment,
        simply the comparison of the total list of nodes to the provided list.

        Returns a list of failures. An empty list indicates successful
        comparison with all criteria.

        A good pattern for use of this method is to provide a list of all
        nodes being deployed across all groups that are successful for a
        given stage of deployment (e.g. all prepared, all deployed).
        Calculations are done using set comparisons, so nodes that are not
        important for this group will be ignored. It is important *not* to
        provide only a list of nodes that were recently acted upon as part of
        this group, as deduplication from overlapping groups may cause the
        calculations to be skewed and report false failures.
        """
        LOG.info('Assessing success criteria for group %s', self.name)
        sc = self.success_criteria.get_failed(success_node_list,
                                              self.full_nodes)
        if sc:
            LOG.info('Group %s failed success criteria', self.name)
        else:
            LOG.info('Group %s success criteria passed', self.name)
        return sc