From 1a28e6b72fe7d330ff2ebde2fc4afacd35bf4a3b Mon Sep 17 00:00:00 2001
From: Drew Walters <drewwalters96@gmail.com>
Date: Mon, 22 Oct 2018 18:52:33 -0500
Subject: [PATCH] wait: Remove test pods from wait

When waiting on resources that share labels with existing test pods,
an upgrade can fail due to a wait operation on the existing test pods.
This change skips wait operations on test resources by filtering them
using Helm hooks.

Change-Id: I465d3429216457ea8d088064cafa74b2b0d9b8cb
---
 armada/const.py                         |  2 ++
 armada/handlers/wait.py                 | 41 +++++++++++++++++++++-
 armada/tests/unit/handlers/test_wait.py | 46 +++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/armada/const.py b/armada/const.py
index e2538812..30d3149b 100644
--- a/armada/const.py
+++ b/armada/const.py
@@ -27,6 +27,8 @@ DEFAULT_CHART_TIMEOUT = 900
 
 # Tiller
 DEFAULT_TILLER_TIMEOUT = 300
+HELM_HOOK_ANNOTATION = 'helm.sh/hook'
+HELM_TEST_HOOKS = ['test-success', 'test-failure']
 STATUS_UNKNOWN = 'UNKNOWN'
 STATUS_DEPLOYED = 'DEPLOYED'
 STATUS_DELETED = 'DELETED'
diff --git a/armada/handlers/wait.py b/armada/handlers/wait.py
index a1312657..7a27945c 100644
--- a/armada/handlers/wait.py
+++ b/armada/handlers/wait.py
@@ -148,6 +148,16 @@ class ResourceWait(ABC):
         '''
         pass
 
+    def include_resource(self, resource):
+        '''
+        Test to include or exclude a resource in a wait operation. This method
+        can be used to exclude resources that should not be included in wait
+        operations (e.g. test pods).
+        :param resource: resource to test
+        :returns: boolean representing test result
+        '''
+        return True
+
     def handle_resource(self, resource):
         resource_name = resource.metadata.name
 
@@ -259,7 +269,9 @@ class ResourceWait(ABC):
 
         resource_list = self.get_resources(**kwargs)
         for resource in resource_list.items:
-            ready[resource.metadata.name] = self.handle_resource(resource)
+            # Only include resources that should be included in wait ops
+            if self.include_resource(resource):
+                ready[resource.metadata.name] = self.handle_resource(resource)
         if not resource_list.items:
             if self.skip_if_none_found:
                 msg = 'Skipping wait, no %s resources found.'
@@ -279,6 +291,11 @@ class ResourceWait(ABC):
             resource = event['object']
             resource_name = resource.metadata.name
             resource_version = resource.metadata.resource_version
+
+            # Skip resources that should be excluded from wait operations
+            if not self.include_resource(resource):
+                continue
+
             msg = ('Watch event: type=%s, name=%s, namespace=%s,'
                    'resource_version=%s')
             LOG.debug(msg, event_type, resource_name,
@@ -330,6 +347,28 @@ class PodWait(ResourceWait):
             resource_type, chart_wait, labels,
             chart_wait.k8s.client.list_namespaced_pod, **kwargs)
 
+    def include_resource(self, resource):
+        pod = resource
+        annotations = pod.metadata.annotations
+
+        # Retrieve pod's Helm test hooks
+        test_hooks = None
+        if annotations:
+            hook_string = annotations.get(const.HELM_HOOK_ANNOTATION)
+            if hook_string:
+                hooks = hook_string.split(',')
+                test_hooks = [h for h in hooks if h in const.HELM_TEST_HOOKS]
+
+        # NOTE(drewwalters96): Test pods may cause wait operations to fail
+        # when old resources remain from previous upgrades/tests. Indicate that
+        # test pods should not be included in wait operations.
+        if test_hooks:
+            LOG.debug('Pod %s will be skipped during wait operations.',
+                      pod.metadata.name)
+            return False
+        else:
+            return True
+
     def is_resource_ready(self, resource):
         pod = resource
         name = pod.metadata.name
diff --git a/armada/tests/unit/handlers/test_wait.py b/armada/tests/unit/handlers/test_wait.py
index ed074f10..cc2db380 100644
--- a/armada/tests/unit/handlers/test_wait.py
+++ b/armada/tests/unit/handlers/test_wait.py
@@ -185,3 +185,49 @@ class ChartWaitTestCase(base.ArmadaTestCase):
         self.assertEqual(2, len(unit.waits))
         for w in unit.waits:
             w.wait.assert_called_once()
+
+
+class PodWaitTestCase(base.ArmadaTestCase):
+
+    def get_unit(self, labels):
+        return wait.PodWait(
+            resource_type='pod', chart_wait=mock.MagicMock(), labels=labels)
+
+    def test_include_resource(self):
+
+        def mock_resource(annotations):
+            resource = mock.Mock()
+            resource.metadata.annotations = annotations
+            return resource
+
+        test_resources = [
+            mock_resource({
+                'key': 'value',
+                'helm.sh/hook': 'test-success'
+            }),
+            mock_resource({
+                'helm.sh/hook': 'test-failure'
+            }),
+            mock_resource({
+                'helm.sh/hook': 'test-success,pre-install'
+            })
+        ]
+        non_test_resources = [
+            mock_resource({
+                'helm.sh/hook': 'pre-install'
+            }),
+            mock_resource({
+                'key': 'value'
+            }),
+            mock_resource({})
+        ]
+
+        unit = self.get_unit({})
+
+        # Validate test resources excluded
+        for resource in test_resources:
+            self.assertFalse(unit.include_resource(resource))
+
+        # Validate other resources included
+        for resource in non_test_resources:
+            self.assertTrue(unit.include_resource(resource))