Tolerate 404 response for non-existent resource

Armada raises an exception when the Kubernetes API returns a 404
response because the resource was not found. However, some resources,
such as jobs, are "volatile": they may disappear after a few minutes.
Armada may send a DELETE request to the Kubernetes API after the job
has finished its lifecycle. In that situation the job is no longer
found in the cluster, and Armada breaks the application reapply
because of this behavior. A possible solution is therefore to check,
at least for the job resource type, whether the response returned by
the Kubernetes API is a 404. If it is, just log the API response
error reason and continue trying to delete the remaining jobs in the
list; otherwise, raise the exception.

Partial-Bug: 1948850

Signed-off-by: Iago Estrela <IagoFilipe.EstrelaBarros@windriver.com>
Change-Id: I12ee8cef869e4443920d7f2e1899556e8940a977
This commit is contained in:
Iago Estrela 2021-10-20 09:16:51 -03:00
parent b37417253f
commit ddbdd7256c
1 changed files with 96 additions and 45 deletions

View File

@ -26,6 +26,7 @@ from hapi.services.tiller_pb2 import UninstallReleaseRequest
from hapi.services.tiller_pb2 import UpdateReleaseRequest from hapi.services.tiller_pb2 import UpdateReleaseRequest
from oslo_config import cfg from oslo_config import cfg
from oslo_log import log as logging from oslo_log import log as logging
from kubernetes.client.rest import ApiException
import yaml import yaml
from armada import const from armada import const
@ -314,17 +315,17 @@ class Tiller(object):
for action in actions.get('update', []): for action in actions.get('update', []):
name = action.get('name') name = action.get('name')
LOG.info('Updating %s ', name) LOG.info('Updating %s ', name)
action_type = action.get('type') resource_type = action.get('type')
labels = action.get('labels') labels = action.get('labels')
self.rolling_upgrade_pod_deployment( self.rolling_upgrade_pod_deployment(
name, release_name, namespace, labels, action_type, chart, name, release_name, namespace, labels, resource_type,
disable_hooks, values, timeout) chart, disable_hooks, values, timeout)
except Exception: except Exception:
LOG.exception( LOG.exception(
"Pre-action failure: could not perform rolling upgrade for " "Pre-action failure: could not perform rolling upgrade for "
"%(res_type)s %(res_name)s.", { "%(res_type)s %(res_name)s.", {
'res_type': action_type, 'res_type': resource_type,
'res_name': name 'res_name': name
}) })
raise ex.PreUpdateJobDeleteException(name, namespace) raise ex.PreUpdateJobDeleteException(name, namespace)
@ -332,16 +333,16 @@ class Tiller(object):
try: try:
for action in actions.get('delete', []): for action in actions.get('delete', []):
name = action.get('name') name = action.get('name')
action_type = action.get('type') resource_type = action.get('type')
labels = action.get('labels', None) labels = action.get('labels', None)
self.delete_resources( self.delete_resources(
action_type, labels, namespace, timeout=timeout) resource_type, labels, namespace, timeout=timeout)
except Exception: except Exception:
LOG.exception( LOG.exception(
"Pre-action failure: could not delete %(res_type)s " "Pre-action failure: could not delete %(res_type)s "
"%(res_name)s.", { "%(res_name)s.", {
'res_type': action_type, 'res_type': resource_type,
'res_name': name 'res_name': name
}) })
raise ex.PreUpdateJobDeleteException(name, namespace) raise ex.PreUpdateJobDeleteException(name, namespace)
@ -617,13 +618,80 @@ class Tiller(object):
status = self.get_release_status(release) status = self.get_release_status(release)
raise ex.ReleaseException(release, status, 'Delete') raise ex.ReleaseException(release, status, 'Delete')
def _delete_jobs(self, jobs, resource_labels, namespace, timeout):
    '''
    Delete every job in ``jobs``, tolerating jobs that are already gone.

    :param jobs: job list object; its ``items`` are iterated and each
        item's ``metadata.name`` is used as the job name
    :param resource_labels: labels used to select the jobs (only used
        in the log message emitted on a 404)
    :param namespace: namespace the jobs live in
    :param timeout: timeout forwarded to ``k8s.delete_job_action``
    :raises ApiException: the original API error, for any status other
        than 404
    '''
    for jb in jobs.items:
        jb_name = jb.metadata.name
        try:
            LOG.info(
                "Deleting job: %s in namespace: %s", jb_name, namespace)
            self.k8s.delete_job_action(jb_name, namespace, timeout=timeout)
        except ApiException as err:
            if err.status != 404:
                # Re-raise the caught error itself so that its status,
                # reason and traceback are preserved for the caller.
                raise
            # A 404 means the job vanished between listing and deletion;
            # log it and carry on with the remaining jobs.
            LOG.warning(
                "No jobs found with labels=%s namespace=%s",
                resource_labels, namespace)
def _delete_cronjobs(
        self,
        cronjobs,
        resource_labels,
        namespace,
        timeout,
        implied_cronjob=False):
    '''
    Delete every cronjob in ``cronjobs``, tolerating missing ones.

    :param cronjobs: cronjob list object; its ``items`` are iterated and
        each item's ``metadata.name`` is used as the cronjob name
    :param resource_labels: labels used to select the cronjobs (only
        used in the log message emitted on a 404)
    :param namespace: namespace the cronjobs live in
    :param timeout: timeout forwarded to ``k8s.delete_cron_job_action``
    :param implied_cronjob: when true, a deprecation warning is logged
        for each cronjob because it is being deleted via ``type: job``
    :raises ApiException: the original API error, for any status other
        than 404
    '''
    for jb in cronjobs.items:
        jb_name = jb.metadata.name
        try:
            # TODO: Remove when v1 doc support is removed.
            if implied_cronjob:
                LOG.warning(
                    "Deleting cronjobs via `type: job` is "
                    "deprecated, use `type: cronjob` instead")

            LOG.info(
                "Deleting cronjob %s in namespace: %s", jb_name, namespace)
            self.k8s.delete_cron_job_action(
                jb_name, namespace, timeout=timeout)
        except ApiException as err:
            if err.status != 404:
                # Re-raise the caught error itself so that its status,
                # reason and traceback are preserved for the caller.
                raise
            # A 404 means the cronjob is already gone; log and continue
            # with the remaining cronjobs.
            LOG.warning(
                "No cronjobs found with labels=%s namespace=%s",
                resource_labels, namespace)
def _delete_pods(
        self, release_pods, resource_labels, namespace, timeout,
        wait=False):
    '''
    Delete every pod in ``release_pods``, tolerating missing ones.

    :param release_pods: pod list object; its ``items`` are iterated and
        each item's ``metadata.name`` is used as the pod name
    :param resource_labels: labels used to select the pods (only used in
        the log message emitted on a 404)
    :param namespace: namespace the pods live in
    :param timeout: timeout forwarded to ``k8s.delete_pod_action``
    :param wait: when true, call ``k8s.wait_for_pod_redeployment`` after
        each successful delete
    :raises ApiException: the original API error, for any status other
        than 404
    '''
    for pod in release_pods.items:
        pod_name = pod.metadata.name
        try:
            LOG.info(
                "Deleting pod %s in namespace: %s", pod_name, namespace)
            self.k8s.delete_pod_action(
                pod_name, namespace, timeout=timeout)
            if wait:
                self.k8s.wait_for_pod_redeployment(pod_name, namespace)
        except ApiException as err:
            if err.status != 404:
                # Re-raise the caught error itself so that its status,
                # reason and traceback are preserved for the caller.
                raise
            # A 404 means the pod is already gone; log and continue with
            # the remaining pods.
            LOG.warning(
                "No pods found with labels=%s namespace=%s",
                resource_labels, namespace)
def _job_implies_cronjob(self, resource_type):
    '''
    Tell whether a ``job`` resource type should also cover cronjobs.

    The schema of the current chart is looked up, and a schema version
    below 2 combined with ``resource_type == 'job'`` yields a true
    result.

    :param resource_type: resource type requested for deletion
    :returns: whether cronjobs are implied by the given resource type
    '''
    current_chart = get_current_chart()
    info = schema.get_schema_info(current_chart['schema'])
    legacy_schema = info.version < 2
    if resource_type == 'job':
        return legacy_schema
    return False
def delete_resources( def delete_resources(
self, self,
resource_type, resource_type,
resource_labels, resource_labels,
namespace, namespace,
wait=False, wait=False,
timeout=const.DEFAULT_TILLER_TIMEOUT): timeout=const.DEFAULT_TILLER_TIMEOUT,
implied_job_check=True):
''' '''
Delete resources matching provided resource type, labels, and Delete resources matching provided resource type, labels, and
namespace. namespace.
@ -643,50 +711,32 @@ class Tiller(object):
handled = False handled = False
if resource_type == 'job': if resource_type == 'job':
get_jobs = self.k8s.get_namespace_job( jobs = self.k8s.get_namespace_job(
namespace, label_selector=label_selector) namespace, label_selector=label_selector)
for jb in get_jobs.items: self._delete_jobs(jobs, resource_labels, namespace, timeout)
jb_name = jb.metadata.name
LOG.info(
"Deleting job: %s in namespace: %s", jb_name, namespace)
self.k8s.delete_job_action(jb_name, namespace, timeout=timeout)
handled = True handled = True
# TODO: Remove when v1 doc support is removed. # TODO: Remove when v1 doc support is removed.
chart = get_current_chart() implied_cronjob = False
schema_info = schema.get_schema_info(chart['schema']) if implied_job_check:
job_implies_cronjob = schema_info.version < 2 implied_cronjob = self._job_implies_cronjob(resource_type)
implied_cronjob = resource_type == 'job' and job_implies_cronjob
if resource_type == 'cronjob' or implied_cronjob: if resource_type == 'cronjob' or implied_cronjob:
get_jobs = self.k8s.get_namespace_cron_job( cronjobs = self.k8s.get_namespace_cron_job(
namespace, label_selector=label_selector) namespace, label_selector=label_selector)
for jb in get_jobs.items: self._delete_cronjobs(
jb_name = jb.metadata.name cronjobs,
resource_labels,
# TODO: Remove when v1 doc support is removed. namespace,
if implied_cronjob: timeout,
LOG.warn( implied_cronjob=implied_cronjob)
"Deleting cronjobs via `type: job` is "
"deprecated, use `type: cronjob` instead")
LOG.info(
"Deleting cronjob %s in namespace: %s", jb_name, namespace)
self.k8s.delete_cron_job_action(jb_name, namespace)
handled = True handled = True
if resource_type == 'pod': if resource_type == 'pod':
release_pods = self.k8s.get_namespace_pod( release_pods = self.k8s.get_namespace_pod(
namespace, label_selector=label_selector) namespace, label_selector=label_selector)
for pod in release_pods.items: self._delete_pods(
pod_name = pod.metadata.name release_pods, resource_labels, namespace, timeout, wait=wait)
LOG.info(
"Deleting pod %s in namespace: %s", pod_name, namespace)
self.k8s.delete_pod_action(pod_name, namespace)
if wait:
self.k8s.wait_for_pod_redeployment(pod_name, namespace)
handled = True handled = True
if not handled: if not handled:
@ -700,7 +750,7 @@ class Tiller(object):
release_name, release_name,
namespace, namespace,
resource_labels, resource_labels,
action_type, resource_type,
chart, chart,
disable_hooks, disable_hooks,
values, values,
@ -709,9 +759,9 @@ class Tiller(object):
update statefulsets (daemon, stateful) update statefulsets (daemon, stateful)
''' '''
if action_type == 'daemonset': if resource_type == 'daemonset':
LOG.info('Updating: %s', action_type) LOG.info('Updating: %s', resource_type)
label_selector = '' label_selector = ''
@ -726,7 +776,7 @@ class Tiller(object):
ds_labels = ds.metadata.labels ds_labels = ds.metadata.labels
if ds_name == name: if ds_name == name:
LOG.info( LOG.info(
"Deleting %s : %s in %s", action_type, ds_name, "Deleting %s : %s in %s", resource_type, ds_name,
namespace) namespace)
self.k8s.delete_daemon_action(ds_name, namespace) self.k8s.delete_daemon_action(ds_name, namespace)
@ -750,7 +800,8 @@ class Tiller(object):
timeout=timeout) timeout=timeout)
else: else:
LOG.error("Unable to exectue name: % type: %s", name, action_type) LOG.error(
"Unable to execute name: % type: %s", name, resource_type)
def rollback_release( def rollback_release(
self, self,