From ed8107baad7cb27ebf074fe8efbcfec974ae4f92 Mon Sep 17 00:00:00 2001 From: Anthony Lin Date: Fri, 8 Dec 2017 01:56:38 +0000 Subject: [PATCH] Add Backoff time before checking cluster join The current logic checks for nodes that started the join process (based on the snapshot of the environment that was taken by the operator at that point in time). It will not check the state of nodes that it is not aware of, i.e. those that it did not capture initially will not be checked. Hence there is a need to introduce backoff time as it takes a while before all the nodes start to join the Cluster. This is a short term stop gap approach until the Promenade API is ready for consumption Change-Id: I2bdf9c970ecb509fe833fd353e6648a97118d79b --- charts/shipyard/values.yaml | 1 + etc/shipyard/shipyard.conf.sample | 2 ++ shipyard_airflow/conf/config.py | 5 +++++ shipyard_airflow/plugins/check_k8s_node_status.py | 9 ++++++++- shipyard_airflow/plugins/drydock_operators.py | 10 ++++++++++ tests/unit/control/test.conf | 4 +++- 6 files changed, 29 insertions(+), 2 deletions(-) diff --git a/charts/shipyard/values.yaml b/charts/shipyard/values.yaml index 04c87254..20103bc3 100644 --- a/charts/shipyard/values.yaml +++ b/charts/shipyard/values.yaml @@ -328,6 +328,7 @@ conf: prepare_node_task_timeout: 1800 deploy_node_query_interval: 30 deploy_node_task_timeout: 3600 + cluster_join_check_backoff_time: 120 healthcheck: schema: http endpoint: /api/v1.0/health diff --git a/etc/shipyard/shipyard.conf.sample b/etc/shipyard/shipyard.conf.sample index e2e5423a..6a2b3c2d 100644 --- a/etc/shipyard/shipyard.conf.sample +++ b/etc/shipyard/shipyard.conf.sample @@ -82,6 +82,8 @@ # Time out (in seconds) for deploy_node task (integer value) #deploy_node_task_timeout = 3600 +# Backoff time (in seconds) before checking cluster join (integer value) +#cluster_join_check_backoff_time = 120 [healthcheck] diff --git a/shipyard_airflow/conf/config.py b/shipyard_airflow/conf/config.py index 66bcc9cc..a49093c1 100644 
--- a/shipyard_airflow/conf/config.py +++ b/shipyard_airflow/conf/config.py @@ -170,6 +170,11 @@ SECTIONS = [ default=3600, help='Time out (in seconds) for deploy_node task' ), + cfg.IntOpt( + 'cluster_join_check_backoff_time', + default=120, + help='Backoff time (in seconds) before checking cluster join' + ), ] ), ConfigSection( diff --git a/shipyard_airflow/plugins/check_k8s_node_status.py b/shipyard_airflow/plugins/check_k8s_node_status.py index b4d2359c..6268c164 100644 --- a/shipyard_airflow/plugins/check_k8s_node_status.py +++ b/shipyard_airflow/plugins/check_k8s_node_status.py @@ -31,8 +31,14 @@ def check_node_status(time_out, interval): Example:: + import time from check_k8s_node_status import check_node_status + # Wait for a while before checking the cluster-join process as + # it takes time for process to be triggered across all nodes + # We will wait for 120 seconds in this example + time.sleep(120) + # Calls function to check that all nodes are in Ready State # Time out in this case is set to 15 mins, the time interval # has been set to 60 seconds @@ -48,8 +54,9 @@ def check_node_status(time_out, interval): # Logs initial state of all nodes in the cluster ret_init = v1.list_node(watch=False) + logging.info("Current state of nodes in Cluster is") + for i in ret_init.items: - logging.info("Current state of nodes in Cluster is") logging.info("%s\t%s\t%s", i.metadata.name, i.status.conditions[-1].status, i.status.conditions[-1].type) diff --git a/shipyard_airflow/plugins/drydock_operators.py b/shipyard_airflow/plugins/drydock_operators.py index 2b5cd4e6..bc1002df 100644 --- a/shipyard_airflow/plugins/drydock_operators.py +++ b/shipyard_airflow/plugins/drydock_operators.py @@ -165,6 +165,16 @@ class DryDockOperator(BaseOperator): self.drydock_action(drydock_client, context, self.action, query_interval, task_timeout) + # Wait for 120 seconds (default value) before checking the cluster + # join process as it takes time for process to be triggered across + # 
all nodes + cluster_join_check_backoff_time = config.get( + 'drydock', 'cluster_join_check_backoff_time') + logging.info("All nodes deployed in MAAS") + logging.info("Wait for %d seconds before checking node state...", + int(cluster_join_check_backoff_time)) + time.sleep(int(cluster_join_check_backoff_time)) + # Check that cluster join process is completed before declaring # deploy_node as 'completed'. Set time out to 30 minutes and set # polling interval to 30 seconds. diff --git a/tests/unit/control/test.conf b/tests/unit/control/test.conf index ee3309e6..2001ad0e 100644 --- a/tests/unit/control/test.conf +++ b/tests/unit/control/test.conf @@ -8,6 +8,7 @@ web_server = http://airflow-web-int.ucp.svc.cluster.local:8080/ [deckhand] service_type = deckhand [drydock] +cluster_join_check_backoff_time = 120 deploy_node_query_interval = 30 deploy_node_task_timeout = 3600 prepare_node_query_interval = 30 @@ -36,4 +37,5 @@ project_name = service user_domain_name = default username = shipyard [shipyard] -service_type = shipyard \ No newline at end of file +service_type = shipyard +