# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time

from kubernetes import client
from kubernetes import config


def check_node_status(time_out, interval, expected_nodes):
    """This function retrieves the current state of the nodes in the
    Kubernetes cluster. We can use it to check the state of the
    cluster join process (drydock/promenade) and determine whether all
    of the bare metal nodes have successfully joined the Kubernetes
    cluster.

    :param time_out: Maximum time in seconds to wait for the nodes to
        reach Ready state
    :param interval: Time in seconds between queries of the node state
    :param expected_nodes: The list of nodes that are expected to be
        present in the check for status

    Example::

        import time
        from check_k8s_node_status import check_node_status

        # Wait for a while before checking the cluster-join process, as
        # it takes time for the process to be triggered across all nodes.
        # We will wait for 120 seconds in this example.
        time.sleep(120)

        # Check that all nodes are in Ready state. The timeout in this
        # case is set to 15 minutes and the interval to 60 seconds.
        #
        # The expected nodes are the nodes to be compared against, as
        # there could be nodes that never show up as ready, and those
        # need to be represented in the response.
        check_node_status(900, 60, expected_nodes=['a', 'b', 'c'])
    """
    # Initialize variables - the nodes we are watching for
    if not expected_nodes:
        # Nothing is expected, so there is nothing to check.
        return []

    not_ready_node_list = list(expected_nodes)

    # Guard against division by zero and non-positive inputs
    if interval < 1:
        interval = 1
    if time_out < 1:
        time_out = 1

    # Calculate the number of times to execute the 'for' loop. Ensure
    # that 'time_out' and 'interval' are passed in as integers; the
    # division yields a floating point number, which is rounded to the
    # nearest whole number.
    end_range = round(int(time_out) / int(interval))

    # end_range + 1 since the first check doesn't have a sleep ahead of it
    for i in range(0, end_range + 1):
logging.info("Remaining expected nodes to join cluster: [%s]",
|
|
", ".join(not_ready_node_list))
|
|
# Get updated snapshot view of Cluster for each iteration
|
|
ret = _get_all_k8s_node_status()
|
|
|
|
# cautiously prevent crashing out of this code to ensure continued
|
|
# processing.
|
|
if ret is not None and hasattr(ret, 'items'):
|
|
# Check the state of nodes against the remaining expceted nodes
|
|
for j in ret.items:
|
|
# resolve response item fields without letting them break
|
|
# the processing loop.
|
|
try:
|
|
node_name = j.metadata.name
|
|
summary_status = j.status.conditions[-1].status
|
|
summary_message = j.status.conditions[-1].message
|
|
except (AttributeError, IndexError):
|
|
# any issue with the response object, move on to next item
|
|
logging.warning("Malformed node status response object. "
|
|
"Processing continues with the next item",
|
|
exc_info=True)
|
|
continue
|
|
|
|
                # Only check nodes that we're currently waiting for
                if node_name in not_ready_node_list:
                    if summary_status != 'True':
                        # Node not ready; log the current state of the node
                        logging.info("Node %s is not ready. Status is: %s",
                                     node_name, summary_message)
                    else:
                        # Remove this node from the list; it is ready
                        not_ready_node_list.remove(node_name)
                        logging.info("Node %s is in ready state", node_name)

        # Determine what to do based on the not_ready_node_list
        if not_ready_node_list and i == end_range:
            # There are remaining items and the timeout has elapsed
            logging.info("Timed Out! Nodes [%s] did not reach ready state",
                         ", ".join(not_ready_node_list))
            break
        elif not not_ready_node_list:
            # Exit the loop when there are no more nodes to wait for
            # (all are ready)
            logging.info("All expected nodes are in ready state")
            break
        else:
            # There are nodes remaining and time remaining; back off and
            # check again in the next iteration.
            logging.info("Waiting %d seconds for next check of cluster status",
                         int(interval))
            time.sleep(int(interval))

    # Return the nodes that are not ready.
    return not_ready_node_list


def _get_all_k8s_node_status():
    """Invoke Kubernetes and return the node list response object"""
    # Note that we are using 'in_cluster_config'
    try:
        config.load_incluster_config()
        v1 = client.CoreV1Api()
        return v1.list_node(watch=False)
    except Exception:
        # Log some diagnostics and return None.
        logging.warning("There was an error retrieving the cluster status",
                        exc_info=True)
        return None
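

# A minimal usage sketch, not part of the original module: it assumes the
# script runs inside the cluster (so load_incluster_config succeeds), and
# the node names below are hypothetical placeholders for real node names.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Give the cluster-join process time to start, then wait up to
    # 15 minutes, polling every 60 seconds, as in the docstring example.
    time.sleep(120)
    not_ready = check_node_status(
        900, 60, expected_nodes=['node-a', 'node-b', 'node-c'])
    if not_ready:
        logging.error("Nodes never reached ready state: %s",
                      ", ".join(not_ready))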