From 9aadc14777196043336f5a72708f64496c2b78b8 Mon Sep 17 00:00:00 2001 From: "DeJaeger, Darren (dd118r)" Date: Thu, 13 May 2021 14:58:32 -0400 Subject: [PATCH] Armada improved logging, uplift dependency This PS: 1) Improves specific logging in Armada, so that it's easier to debug deployment-related issues. 2) Uplifts the minimum k8s Python dependency to 12.0.0. 3) Enforces 'watch' timeouts more strictly, as the call to the Kubernetes Python watch function seemed unreliable. 4) Adds a field selector to the 'watch' stream so that it waits for the DELETED event on the specific pod/job/cronjob by name, rather than looking across the whole namespace or via labels. This narrows what the watch is looking at, making the logs less busy. Change-Id: I1952b0db32fb0b56ffffcddeae0532beb5a27b67 --- armada/handlers/k8s.py | 48 ++++++++++++++++++++++++--------------- armada/handlers/tiller.py | 6 ++--- requirements.txt | 2 +- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/armada/handlers/k8s.py b/armada/handlers/k8s.py index 727ecb98..31b00c70 100644 --- a/armada/handlers/k8s.py +++ b/armada/handlers/k8s.py @@ -13,6 +13,7 @@ # limitations under the License. 
import re +import time from kubernetes import client from kubernetes import config @@ -151,34 +152,45 @@ class K8s(object): timeout = self._check_timeout(timeout) LOG.debug( - 'Watching to delete %s %s, Wait timeout=%s', - object_type_description, name, timeout) + 'Watching to delete %s: %s in namespace=%s (wait timeout=%s)', + object_type_description, name, namespace, timeout) body = client.V1DeleteOptions( propagation_policy=propagation_policy) w = watch.Watch() issue_delete = True found_events = False - for event in w.stream(list_func, namespace=namespace, - timeout_seconds=timeout): - if issue_delete: - delete_func(name=name, namespace=namespace, body=body) - issue_delete = False - event_type = event['type'].upper() - item_name = event['object'].metadata.name - LOG.debug('Watch event %s on %s', event_type, item_name) + deadline = round(time.time() + timeout) + while timeout > 0: + for event in w.stream( + list_func, namespace=namespace, + field_selector='metadata.name={}'.format(name), + timeout_seconds=timeout): + if issue_delete: + delete_func(name=name, namespace=namespace, body=body) + issue_delete = False - if item_name == name: - found_events = True - if event_type == 'DELETED': - LOG.info( - 'Successfully deleted %s %s', - object_type_description, item_name) - return + event_type = event['type'].upper() + item = event['object'] + item_name = item.metadata.name + LOG.debug( + 'Watch event seen: type=%s, name=%s, ' + 'namespace=%s (waiting on %s: %s)', event_type, + item_name, namespace, object_type_description, name) + + if item_name == name: + found_events = True + if event_type == 'DELETED': + LOG.info( + 'Successfully deleted %s: %s in namespace=%s', + object_type_description, item_name, namespace) + return + + timeout = round(deadline - time.time()) if not found_events: LOG.warn( - 'Saw no delete events for %s %s in namespace=%s', + 'Saw no events for %s: %s in namespace=%s', object_type_description, name, namespace) err_msg = ( diff --git 
a/armada/handlers/tiller.py b/armada/handlers/tiller.py index 77fa4a3c..c685bbdd 100644 --- a/armada/handlers/tiller.py +++ b/armada/handlers/tiller.py @@ -638,8 +638,8 @@ class Tiller(object): if resource_labels is not None: label_selector = label_selectors(resource_labels) LOG.debug( - "Deleting resources in namespace %s matching " - "selectors (%s).", namespace, label_selector) + "Deleting resources in namespace: %s, matching " + "selectors: %s (timeout=%s).", namespace, label_selector, timeout) handled = False if resource_type == 'job': @@ -649,7 +649,7 @@ class Tiller(object): jb_name = jb.metadata.name LOG.info( - "Deleting job %s in namespace: %s", jb_name, namespace) + "Deleting job: %s in namespace: %s", jb_name, namespace) self.k8s.delete_job_action(jb_name, namespace, timeout=timeout) handled = True diff --git a/requirements.txt b/requirements.txt index 0527ddf3..db84d2c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ jsonschema>=3.0.1<4 keystoneauth1>=3.18.0 keystonemiddleware==5.3.0 kombu<4.7,>=4.6.10 -kubernetes>=11.0.0 +kubernetes>=12.0.0 Paste>=2.0.3 PasteDeploy>=1.5.2 protobuf>=3.4.0