From 1ae9a7c4be9660fcdee18f342e1c59b13e95d2eb Mon Sep 17 00:00:00 2001
From: "Bartra, Rick (rb560u)"
Date: Wed, 26 Aug 2020 12:54:49 -0400
Subject: [PATCH] Improve logging of Shipyard _execute_task method during
 failure or timeout

In the case where a Drydock task either fails or times out, it is not
very clear which Drydock task is being referred to. This commit updates
the log messages to include the Drydock task-id that has either failed
or timed out.

Additionally, debug logging is added to record the Drydock task status.
The reason is that there are situations where a Drydock task is still
running, but on the Shipyard side the task has timed out, so Shipyard
shows the task (e.g. prepare_nodes or deploy_nodes) as failed. A
point-in-time reference to the task state is useful for a couple of
reasons:

1) To verify via logging that Shipyard is reporting Drydock task
   results (success, failure, partial_success, etc.) accurately.
2) If Drydock later shows a node as deployed, the task can be queried
   and its state checked to confirm whether the task was indeed a
   success.

Change-Id: I7050338e2f92ad548e639e2ea4059a520c27e686
---
 .../shipyard_airflow/plugins/drydock_nodes.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_nodes.py b/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_nodes.py
index 96fa40c0..3bb65feb 100644
--- a/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_nodes.py
+++ b/src/bin/shipyard_airflow/shipyard_airflow/plugins/drydock_nodes.py
@@ -245,6 +245,7 @@ class DrydockNodesOperator(DrydockBaseOperator):
         """
         self.create_task(task_name)
         result = QueryTaskResult(self.drydock_task_id, task_name)
+        task_dict = self.get_task_dict(self.drydock_task_id)
 
         try:
             self.query_task(interval, timeout)
@@ -252,17 +253,25 @@
             # Task failure may be successful enough based on success criteria.
             # This should not halt the overall flow of this workflow step.
             LOG.warn(
-                "Task %s has failed. Logs contain details of the failure. "
-                "Some nodes may be succesful, processing continues", task_name
+                "Task %s with Drydock task-id: %s has failed. Logs contain "
+                "details of the failure. Some nodes may be successful, "
+                "processing continues", task_name, self.drydock_task_id
             )
+            LOG.debug(
+                "Current state of failed Drydock task %s: %s",
+                self.drydock_task_id, task_dict)
         except DrydockTaskTimeoutException:
             # Task timeout may be successful enough based on success criteria.
             # This should not halt the overall flow of this workflow step.
             LOG.warn(
-                "Task %s has timed out after %s seconds. Logs contain details "
-                "of the failure. Some nodes may be succesful, processing "
-                "continues", task_name, timeout
+                "Task %s with Drydock task-id: %s has timed out after %s "
+                "seconds. Logs contain details of the failure. Some nodes may "
+                "be successful, processing continues", task_name,
+                self.drydock_task_id, timeout
             )
+            LOG.debug(
+                "Current state of timed out Drydock task %s: %s",
+                self.drydock_task_id, task_dict)
         # Other AirflowExceptions will fail the whole task - let them do this.
 
         # find successes
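
As a reviewer aid (not part of the change itself), the sketch below illustrates
the pattern this patch introduces: take a point-in-time snapshot of the Drydock
task dict right after task creation, then emit it at debug level when polling
raises a failure or timeout. The client object, its method signatures, and the
stub exceptions are hypothetical stand-ins used only for illustration; just the
log wording and the snapshot-before-polling placement mirror the operator code
touched by this patch.

import logging

LOG = logging.getLogger(__name__)


class DrydockTaskFailedException(Exception):
    """Stand-in for the Shipyard exception of the same name."""


class DrydockTaskTimeoutException(Exception):
    """Stand-in for the Shipyard exception of the same name."""


def execute_task_with_snapshot(client, task_name, interval, timeout):
    # 'client' is a hypothetical helper exposing create_task(),
    # get_task_dict() and query_task(); in the real operator these are
    # methods inherited from DrydockBaseOperator.
    task_id = client.create_task(task_name)

    # Point-in-time snapshot of the Drydock task, taken up front so the
    # state can still be reported if polling later raises.
    task_dict = client.get_task_dict(task_id)

    try:
        client.query_task(task_id, interval, timeout)
    except DrydockTaskFailedException:
        LOG.warning(
            "Task %s with Drydock task-id: %s has failed. Some nodes may "
            "be successful, processing continues", task_name, task_id)
        LOG.debug("Current state of failed Drydock task %s: %s",
                  task_id, task_dict)
    except DrydockTaskTimeoutException:
        LOG.warning(
            "Task %s with Drydock task-id: %s has timed out after %s "
            "seconds. Some nodes may be successful, processing continues",
            task_name, task_id, timeout)
        LOG.debug("Current state of timed out Drydock task %s: %s",
                  task_id, task_dict)

Note that, as in the diff, the snapshot is fetched before query_task() rather
than inside the exception handlers, so the debug line reports the state
observed at the start of polling; if the state at the moment of failure were
wanted instead, get_task_dict() would need to be called again in the handlers.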