From ae87cd17146f0a4a6956e8c3b661f2a9e22b20d0 Mon Sep 17 00:00:00 2001
From: Scott Hussey
Date: Fri, 17 Nov 2017 11:23:16 -0600
Subject: [PATCH] Update image and chart mgmt

NOTE: This has become a monolithic commit to get gate
settings/scripts in place for CI

- Add Makefile with UCP standard entrypoints
- Move Dockerfile into images/drydock per UCP standards
- Add values.yaml entries for uWSGI threads and workers
- Add environment variables to chart Deployment manifest for
  uWSGI threads and workers
- Add threads and workers specification to uWSGI commandline
  in entrypoint
- Test that the Drydock API is responding
- Test that the Drydock API rejects noauth requests
- Fix Makefile utility script to work behind a proxy

Correct task success voting

Some tasks were incorrectly considered partial_success even when
no failure occurred.

- Network configuration erroneously marked messages as errors
- Update result propagation logic to only use the latest retry

The deploy_nodes task ended as incomplete due to a missing
subtask assignment

Also added a node check step to prepare_nodes so that nodes that
are already under provisioner control (MaaS) are not IPMI-rebooted.

Tangential changes:

- added a config item for the leadership claim interval
- added some debug logging to the bootaction_report task
- fix tasks list API endpoint to generate valid JSON

Improve task concurrency

When tasks are started with a scope of multiple nodes, split the
main task so each node is managed independently to de-link the
progression of nodes.

- Split the prepare_nodes task
- Begin reducing cyclomatic complexity to allow for better unit testing
- Improved tox testing to include coverage by default
- Include postgresql integration tests in coverage

Closes #73

Change-Id: I600c2a4db74dd42e809bc3ee499fb945ebdf31f6
---
 Makefile                                           |  83 +++
 charts/drydock/templates/deployment.yaml           |   6 +
 .../templates/tests/test-drydock-api.yaml          |  37 ++
 .../templates/tests/test-drydock-auth.yaml         |  38 ++
 charts/drydock/values.yaml                         |  18 +
 drydock_provisioner/cli/node/.actions.py.swp       | Bin 12288 -> 0 bytes
 drydock_provisioner/config.py                      |   8 +-
 drydock_provisioner/control/tasks.py               |   2 +-
 .../drivers/node/maasdriver/actions/node.py        |  16 +-
 .../drivers/node/maasdriver/driver.py              |   2 +-
 .../drivers/oob/pyghmi_driver/actions/oob.py       |  32 +-
 .../drivers/oob/pyghmi_driver/driver.py            |   1 +
 drydock_provisioner/objects/task.py                |  29 +-
 .../orchestrator/actions/orchestrator.py           | 523 ++++++++++++------
 .../orchestrator/orchestrator.py                   |   2 +-
 drydock_provisioner/statemgmt/db/tables.py         |   1 +
 entrypoint.sh                                      |  15 +-
 Dockerfile => images/drydock/Dockerfile            |   0
 .../postgres/test_action_prepare_site.py           |  52 ++
 .../postgres/test_api_bootaction.py                |  12 +-
 .../postgres/test_api_bootaction_status.py         |  10 +-
 tests/integration/postgres/test_postgres.sh        |  16 -
 tests/postgres/start_postgres.sh                   |  25 +
 tests/postgres/stop_postgres.sh                    |   2 +
 tests/start_postgres.sh                            |  10 -
 tools/drydock_image_run.sh                         |  86 +++
 tools/helm_tk.sh                                   |  65 +++
 tox.ini                                            |  26 +-
 28 files changed, 866 insertions(+), 251 deletions(-)
 create mode 100644 Makefile
 create mode 100644 charts/drydock/templates/tests/test-drydock-api.yaml
 create mode 100644 charts/drydock/templates/tests/test-drydock-auth.yaml
 delete mode 100644 drydock_provisioner/cli/node/.actions.py.swp
 rename Dockerfile => images/drydock/Dockerfile (100%)
 create mode 100644 tests/integration/postgres/test_action_prepare_site.py
 delete mode 100755 tests/integration/postgres/test_postgres.sh
 create mode 100755 tests/postgres/start_postgres.sh
 create mode 100755 tests/postgres/stop_postgres.sh
 delete mode 100755 tests/start_postgres.sh
 create mode 100755 tools/drydock_image_run.sh
 create mode 100755 tools/helm_tk.sh

diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..1a41cc18
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,83 @@
+# Copyright 2017 AT&T Intellectual Property.  All other rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DRYDOCK_IMAGE_NAME ?= drydock
+IMAGE_PREFIX ?= attcomdev
+IMAGE_TAG ?= latest
+HELM ?= helm
+PROXY ?= http://one.proxy.att.com:8080
+USE_PROXY ?= false
+
+export
+
+# Build all docker images for this project
+.PHONY: images
+images: build_drydock
+
+# Run an image locally and exercise simple tests
+.PHONY: run_images
+run_images: run_drydock
+
+# Run tests
+.PHONY: tests
+tests: coverage_test
+
+# Run unit and Postgres integration tests in coverage mode
+.PHONY: coverage_test
+coverage_test: build_drydock
+	tox -e coverage
+
+# Run the drydock container and exercise simple tests
+.PHONY: run_drydock
+run_drydock: build_drydock
+	tools/drydock_image_run.sh
+
+# Create tgz of the chart
+.PHONY: charts
+charts: clean
+	$(HELM) dep up charts/drydock
+	$(HELM) package charts/drydock
+
+# Perform Linting
+.PHONY: lint
+lint: pep8 helm_lint
+
+# Dry run templating of chart
+.PHONY: dry-run
+dry-run: clean
+	tools/helm_tk.sh $(HELM)
+	$(HELM) template charts/drydock
+
+# Make targets intended for use by the primary targets above.
+
+.PHONY: build_drydock
+build_drydock:
+ifeq ($(USE_PROXY), true)
+	docker build -t $(IMAGE_PREFIX)/$(DRYDOCK_IMAGE_NAME):$(IMAGE_TAG) -f images/drydock/Dockerfile . --build-arg http_proxy=$(PROXY) --build-arg https_proxy=$(PROXY)
+else
+	docker build -t $(IMAGE_PREFIX)/$(DRYDOCK_IMAGE_NAME):$(IMAGE_TAG) -f images/drydock/Dockerfile .
+endif
+
+.PHONY: clean
+clean:
+	rm -rf build
+
+.PHONY: pep8
+pep8:
+	tox -e pep8
+
+.PHONY: helm_lint
+helm_lint: clean
+	tools/helm_tk.sh $(HELM)
+	$(HELM) lint charts/drydock
diff --git a/charts/drydock/templates/deployment.yaml b/charts/drydock/templates/deployment.yaml
index bd990c44..b30159a6 100644
--- a/charts/drydock/templates/deployment.yaml
+++ b/charts/drydock/templates/deployment.yaml
@@ -33,6 +33,8 @@ spec:
         configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
     spec:
       restartPolicy: Always
+      affinity:
+{{ tuple $envAll "drydock" "api" | include "helm-toolkit.snippets.kubernetes_pod_anti_affinity" | indent 8 }}
       initContainers:
{{ tuple $envAll $dependencies "[]" | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }}
       containers:
@@ -43,6 +45,10 @@ spec:
                 secretKeyRef:
                   name: 'maas-api-key'
                   key: 'token'
+            - name: 'DRYDOCK_API_WORKERS'
+              value: {{ .Values.conf.uwsgi.workers | default 1 | quote }}
+            - name: 'DRYDOCK_API_THREADS'
+              value: {{ .Values.conf.uwsgi.threads | default 1 | quote }}
           image: {{ .Values.images.tags.drydock }}
           imagePullPolicy: {{ .Values.images.pull_policy }}
{{ tuple $envAll $envAll.Values.pod.resources.api | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
diff --git a/charts/drydock/templates/tests/test-drydock-api.yaml b/charts/drydock/templates/tests/test-drydock-api.yaml
new file mode 100644
index 00000000..0e76f569
--- /dev/null
+++ b/charts/drydock/templates/tests/test-drydock-api.yaml
@@ -0,0 +1,37 @@
+{{/*
+# Copyright (c) 2017 AT&T Intellectual Property. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. */}}
+{{/*
+Test that the API is up and the health endpoint returns a 2XX code */}}
+{{- if .Values.manifests.test_drydock_api }}
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: "{{ .Release.Name }}-api-test"
+  annotations:
+    "helm.sh/hook": "test-success"
+spec:
+  restartPolicy: Never
+  containers:
+    - name: "{{ .Release.Name }}-api-test"
+      env:
+        - name: 'DRYDOCK_URL'
+          value: {{ tuple "physicalprovisioner" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" | quote }}
+      image: {{ .Values.images.tags.drydock }}
+      imagePullPolicy: {{ .Values.images.pull_policy }}
{{ tuple . .Values.pod.resources.test | include "helm-toolkit.snippets.kubernetes_resources" | indent 8 }}
+      command: ["/bin/bash", "-c", "curl -v -X GET --fail ${DRYDOCK_URL}/api/v1.0/health; exit $?"]
+...
+{{- end }}
diff --git a/charts/drydock/templates/tests/test-drydock-auth.yaml b/charts/drydock/templates/tests/test-drydock-auth.yaml
new file mode 100644
index 00000000..1c0e4e89
--- /dev/null
+++ b/charts/drydock/templates/tests/test-drydock-auth.yaml
@@ -0,0 +1,38 @@
+{{/*
+# Copyright (c) 2017 AT&T Intellectual Property. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. */}}
+{{/*
+Test to validate that the Drydock API pod rejects unauthenticated
+requests */}}
+{{- if .Values.manifests.test_drydock_auth }}
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: "{{ .Release.Name }}-auth-test"
+  annotations:
+    helm.sh/hook: "test-failure"
+spec:
+  restartPolicy: Never
+  containers:
+    - name: "{{ .Release.Name }}-auth-test"
+      env:
+        - name: 'DRYDOCK_URL'
+          value: {{ tuple "physicalprovisioner" "internal" "api" . | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" | quote }}
+      image: {{ .Values.images.tags.drydock }}
+      imagePullPolicy: {{ .Values.images.pull_policy }}
{{ tuple . .Values.pod.resources.test | include "helm-toolkit.snippets.kubernetes_resources" | indent 8 }}
+      command: ["/bin/bash", "-c", "curl -v -X GET --fail ${DRYDOCK_URL}/api/v1.0/tasks; exit $?"]
+...
+{{- end }}
diff --git a/charts/drydock/values.yaml b/charts/drydock/values.yaml
index 879a1bf7..7eae0fa4 100644
--- a/charts/drydock/values.yaml
+++ b/charts/drydock/values.yaml
@@ -46,8 +46,21 @@ pod:
       rolling_update:
         max_unavailable: 1
         max_surge: 3
+  affinity:
+    anti:
+      type:
+        default: preferredDuringSchedulingIgnoredDuringExecution
+      topologyKey:
+        default: kubernetes.io/hostname
   resources:
     enabled: false
+    test:
+      limits:
+        memory: "128Mi"
+        cpu: "100m"
+      requests:
+        memory: "128Mi"
+        cpu: "100m"
     api:
       limits:
         memory: "128Mi"
@@ -103,6 +116,8 @@ manifests:
   configmap_bin: true
   service_drydock: true
   deployment_drydock: true
+  test_drydock_api: true
+  test_drydock_auth: true

 dependencies:
   db_init:
@@ -220,6 +235,9 @@ database:

 # Settings for drydock.conf
 conf:
+  uwsgi:
+    threads: 1
+    workers: 1
   drydock:
     logging:
       log_level: 'DEBUG'
diff --git a/drydock_provisioner/cli/node/.actions.py.swp b/drydock_provisioner/cli/node/.actions.py.swp
deleted file mode 100644
index 0ddf5376c3a92f776acca4b3b0fe3a12c1c4d7c1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288

diff --git a/drydock_provisioner/config.py b/drydock_provisioner/config.py
index 4214cdaf..f47074dd 100644
--- a/drydock_provisioner/config.py
+++ b/drydock_provisioner/config.py
@@ -54,7 +54,13 @@ class DrydockConfig(object):
             'leader_grace_period',
             default=300,
             help=
-            'How long a leader has to check-in before leaderhsip can be usurped, in seconds'
+            'How long a leader has to check-in before leadership can be usurped, in seconds'
+        ),
+        cfg.IntOpt(
+            'leadership_claim_interval',
+            default=30,
+            help=
+            'How often will an instance attempt to claim leadership, in seconds'
         ),
     ]
diff --git a/drydock_provisioner/control/tasks.py b/drydock_provisioner/control/tasks.py
index 3441726d..aba3581d 100644
--- a/drydock_provisioner/control/tasks.py
+++ b/drydock_provisioner/control/tasks.py
@@ -41,7 +41,7 @@ class TasksResource(StatefulResource):
         """Handler for GET method."""
         try:
             task_model_list = self.state_manager.get_tasks()
-            task_list = [str(x.to_dict()) for x in task_model_list]
+            task_list = [x.to_dict() for x in task_model_list]
             resp.body = json.dumps(task_list)
             resp.status = falcon.HTTP_200
         except Exception as ex:
diff --git a/drydock_provisioner/drivers/node/maasdriver/actions/node.py b/drydock_provisioner/drivers/node/maasdriver/actions/node.py
index 6b2a07d6..7fa3d8e5 100644
--- a/drydock_provisioner/drivers/node/maasdriver/actions/node.py
+++ b/drydock_provisioner/drivers/node/maasdriver/actions/node.py
@@ -224,7 +224,7 @@ class CreateNetworkTemplate(BaseMaasAction):

         # Try to true up MaaS definitions of fabrics/vlans/subnets
         # with the networks defined in Drydock
-        design_networks = site_design.networks
+        design_networks = list()
         design_links = site_design.network_links

         fabrics = maas_fabric.Fabrics(self.maas_client)
@@ -306,6 +306,7 @@ class CreateNetworkTemplate(BaseMaasAction):
                 # that VLAN tags and subnet attributes are correct
                 for net_name in l.allowed_networks:
                     n = site_design.get_network(net_name)
+                    design_networks.append(n)

                     if n is None:
                         continue
@@ -406,7 +407,7 @@ class CreateNetworkTemplate(BaseMaasAction):
                                 subnet.resource_id, n.name)
                             self.task.add_status_msg(
                                 msg=msg,
-                                error=True,
+                                error=False,
                                 ctx=n.name,
                                 ctx_type='network')
                             self.logger.info(msg)
@@ -430,7 +431,7 @@ class CreateNetworkTemplate(BaseMaasAction):
                             self.logger.debug(msg)
                             self.task.add_status_msg(
                                 msg=msg,
-                                error=True,
+                                error=False,
                                 ctx=n.name,
                                 ctx_type='network')
                         else:
@@ -442,6 +443,7 @@ class CreateNetworkTemplate(BaseMaasAction):
                                 error=True,
                                 ctx=n.name,
                                 ctx_type='network')
+                            self.task.failure(focus=n.name)

                     # Check if the routes have a default route
                     subnet.gateway_ip = n.get_default_gateway()
@@ -527,6 +529,7 @@ class CreateNetworkTemplate(BaseMaasAction):
                                 error=True,
                                 ctx=n.name,
                                 ctx_type='network')
+                            self.task.failure(focus=n.name)

                         elif dhcp_on and vlan.dhcp_on:
                             self.logger.info("DHCP already enabled for subnet %s" %
@@ -558,9 +561,12 @@ class CreateNetworkTemplate(BaseMaasAction):
                     msg = "Could not locate destination network for static route to %s." % route_net
                     self.task.add_status_msg(
                         msg=msg, error=True, ctx=n.name, ctx_type='network')
+                    self.task.failure(focus=n.name)
                     self.logger.info(msg)
                     continue

+        # now validate that all subnets allowed on a link were created
+
         for n in design_networks:
             if n.metalabels is not None:
                 # TODO(sh8121att): move metalabels into config
@@ -572,13 +578,13 @@ class CreateNetworkTemplate(BaseMaasAction):
             exists = subnet_list.singleton({'cidr': n.cidr})

             if exists is not None:
-                self.task.success()
+                self.task.success(focus=n.name)
             else:
                 msg = "Network %s defined, but not found in MaaS after network config task." % n.name
                 self.logger.error(msg)
                 self.task.add_status_msg(
                     msg=msg, error=True, ctx=n.name, ctx_type='network')
-                self.task.failure()
+                self.task.failure(focus=n.name)

         self.task.set_status(hd_fields.TaskStatus.Complete)
         self.task.save()
diff --git a/drydock_provisioner/drivers/node/maasdriver/driver.py b/drydock_provisioner/drivers/node/maasdriver/driver.py
index 6f49c058..1f5713be 100644
--- a/drydock_provisioner/drivers/node/maasdriver/driver.py
+++ b/drydock_provisioner/drivers/node/maasdriver/driver.py
@@ -155,7 +155,7 @@ class MaasNodeDriver(NodeDriver):
             nf = self.orchestrator.create_nodefilter_from_nodelist([n])
             subtask = self.orchestrator.create_task(
                 design_ref=task.design_ref,
-                action=hd_fields.OrchestratorAction.IdentifyNode,
+                action=task.action,
                 node_filter=nf,
                 retry=task.retry)
             task.register_subtask(subtask)
diff --git a/drydock_provisioner/drivers/oob/pyghmi_driver/actions/oob.py b/drydock_provisioner/drivers/oob/pyghmi_driver/actions/oob.py
index 2184f4f0..9b31360c 100644
--- a/drydock_provisioner/drivers/oob/pyghmi_driver/actions/oob.py
+++ b/drydock_provisioner/drivers/oob/pyghmi_driver/actions/oob.py
@@ -71,7 +71,7 @@ class PyghmiBaseAction(BaseAction):
                 ipmi_session = self.get_ipmi_session(node)
             except (IpmiException, errors.DriverError) as iex:
                 self.logger.error("Error initializing IPMI session for node %s"
-                                  % self.node.name)
+                                  % node.name)
                 self.logger.debug("IPMI Exception: %s" % str(iex))
                 self.logger.warning(
                     "IPMI command failed, retrying after 15 seconds...")
@@ -168,14 +168,14 @@ class SetNodeBoot(PyghmiBaseAction):
                     ctx=n.name,
                     ctx_type='node')
                 self.logger.debug("%s reports bootdev of network" % n.name)
-                self.task.success()
+                self.task.success(focus=n.name)
             else:
                 self.task.add_status_msg(
                     msg="Unable to set bootdev to PXE.",
                     error=True,
                     ctx=n.name,
                     ctx_type='node')
-                self.task.failure()
+                self.task.failure(focus=n.name)
                 self.logger.warning("Unable to set node %s to PXE boot." %
                                     (n.name))
@@ -219,7 +219,7 @@ class PowerOffNode(PyghmiBaseAction):
                     ctx_type='node')
                 self.logger.debug(
                     "Node %s reports powerstate of off" % n.name)
-                self.task.success()
+                self.task.success(focus=n.name)
                 break
             time.sleep(10)
             i = i - 1
@@ -232,7 +232,7 @@ class PowerOffNode(PyghmiBaseAction):
                 ctx=n.name,
                 ctx_type='node')
             self.logger.error("Giving up on IPMI command to %s" % n.name)
-            self.task.failure()
+            self.task.failure(focus=n.name)

         self.task.set_status(hd_fields.TaskStatus.Complete)
         self.task.save()
@@ -274,7 +274,7 @@ class PowerOnNode(PyghmiBaseAction):
                     error=False,
                     ctx=n.name,
                     ctx_type='node')
-                self.task.success()
+                self.task.success(focus=n.name)
                 break
             time.sleep(10)
             i = i - 1
@@ -287,7 +287,7 @@ class PowerOnNode(PyghmiBaseAction):
                 ctx=n.name,
                 ctx_type='node')
             self.logger.error("Giving up on IPMI command to %s" % n.name)
-            self.task.failure()
+            self.task.failure(focus=n.name)

         self.task.set_status(hd_fields.TaskStatus.Complete)
         self.task.save()
@@ -340,7 +340,7 @@ class PowerCycleNode(PyghmiBaseAction):
                 self.logger.warning(
                     "Failed powering down node %s during power cycle task" %
                     n.name)
-                self.task.failure()
+                self.task.failure(focus=n.name)
                 break

             self.logger.debug("Sending set_power = on command to %s" % n.name)
@@ -367,7 +367,7 @@ class PowerCycleNode(PyghmiBaseAction):
                     error=False,
                     ctx=n.name,
                     ctx_type='node')
-                self.task.success()
+                self.task.success(focus=n.name)
             else:
                 self.task.add_status_msg(
                     msg="Failed to power up during power cycle.",
@@ -377,7 +377,7 @@ class PowerCycleNode(PyghmiBaseAction):
                 self.logger.warning(
                     "Failed powering up node %s during power cycle task" %
                     n.name)
-                self.task.failure()
+                self.task.failure(focus=n.name)

         self.task.set_status(hd_fields.TaskStatus.Complete)
         self.task.save()
@@ -400,14 +400,16 @@ class InterrogateOob(PyghmiBaseAction):
             try:
                 self.logger.debug(
                     "Interrogating node %s IPMI interface." % n.name)
-                mci_id = self.exec_ipmi_command(n, Command.get_mci)
+                powerstate = self.exec_ipmi_command(n, Command.get_power)
+                if powerstate is None:
+                    raise errors.DriverError()
                 self.task.add_status_msg(
-                    msg="IPMI interface interrogation yielded MCI ID %s" %
-                    mci_id,
+                    msg="IPMI interface interrogation yielded powerstate %s" %
+                    powerstate.get('powerstate'),
                     error=False,
                     ctx=n.name,
                     ctx_type='node')
-                self.task.success()
+                self.task.success(focus=n.name)
             except errors.DriverError:
                 self.logger.debug(
                     "Interrogating node %s IPMI interface failed." %
                     n.name)
@@ -416,7 +418,7 @@ class InterrogateOob(PyghmiBaseAction):
                     error=True,
                     ctx=n.name,
                     ctx_type='node')
-                self.task.failure()
+                self.task.failure(focus=n.name)

         self.task.set_status(hd_fields.TaskStatus.Complete)
         self.task.save()
diff --git a/drydock_provisioner/drivers/oob/pyghmi_driver/driver.py b/drydock_provisioner/drivers/oob/pyghmi_driver/driver.py
index b35b9509..0c4ae137 100644
--- a/drydock_provisioner/drivers/oob/pyghmi_driver/driver.py
+++ b/drydock_provisioner/drivers/oob/pyghmi_driver/driver.py
@@ -139,6 +139,7 @@ class PyghmiDriver(oob_driver.OobDriver):
                             uuid.UUID(bytes=t)),
                     exc_info=f.exception())
         task.align_result()
+        task.bubble_results()
         task.set_status(hd_fields.TaskStatus.Complete)
         task.save()
diff --git a/drydock_provisioner/objects/task.py b/drydock_provisioner/objects/task.py
index 75b6128e..96e0408d 100644
--- a/drydock_provisioner/objects/task.py
+++ b/drydock_provisioner/objects/task.py
@@ -17,6 +17,7 @@ import uuid
 import json
 import time
 import logging
+import copy

 from datetime import datetime

@@ -62,7 +63,7 @@ class Task(object):
         self.retry = retry
         self.parent_task_id = parent_task_id
         self.created = datetime.utcnow()
-        self.node_filter = node_filter
+        self.node_filter = copy.deepcopy(node_filter)
         self.created_by = None
         self.updated = None
         self.terminated = None
@@ -82,7 +83,10 @@ class Task(object):
         return self.task_id

     def retry_task(self, max_attempts=None):
-        """Check if this task should be retried and update attributes if so."""
+        """Check if this task should be retried and update attributes if so.
+
+        :param max_attempts: The maximum number of retries for this task
+        """
         if (self.result.status != hd_fields.ActionResult.Success) and (len(
                 self.result.failures) > 0):
             if not max_attempts or (max_attempts
@@ -237,12 +241,16 @@ class Task(object):
         nf['filter_set'].append(
             dict(node_names=self.result.successes, filter_type='union'))

+        return nf
+
     def node_filter_from_failures(self):
         """Create a node filter from failure entities in this task's result."""
         nf = dict(filter_set_type='intersection', filter_set=[])
         nf['filter_set'].append(
             dict(node_names=self.result.failures, filter_type='union'))

+        return nf
+
     def bubble_results(self, action_filter=None):
         """Combine successes and failures of subtasks and update this task with the result.
@@ -277,23 +285,32 @@ class Task(object):
             else:
                 self.logger.debug("Skipping subtask due to action filter.")

-    def align_result(self, action_filter=None):
+    def align_result(self, action_filter=None, reset_status=True):
         """Align the result of this task with the combined results of all the subtasks.

+        If this task has a retry counter > 0, then failure or partial_success results
+        of a subtask are only counted if the subtask retry counter is equivalent to this
+        task's.
+
         :param action_filter: string action name to filter subtasks on
+        :param reset_status: Whether to reset the result status of this task before aligning
         """
+        if reset_status:
+            self.result.status = hd_fields.ActionResult.Incomplete
         for st in self.statemgr.get_complete_subtasks(self.task_id):
             if action_filter is None or (action_filter is not None
                                          and st.action == action_filter):
+                self.logger.debug("Collecting result status from subtask %s." %
                                   str(st.task_id))
                 if st.get_result() in [
                         hd_fields.ActionResult.Success,
                         hd_fields.ActionResult.PartialSuccess
                 ]:
                     self.success()
-                if st.get_result() in [
+                if (st.get_result() in [
                         hd_fields.ActionResult.Failure,
                         hd_fields.ActionResult.PartialSuccess
-                ]:
+                ] and (self.retry == 0 or (self.retry == st.retry))):
                     self.failure()
             else:
                 self.logger.debug("Skipping subtask due to action filter.")
@@ -363,6 +380,7 @@ class Task(object):
             'request_context':
             json.dumps(self.request_context.to_dict())
             if self.request_context is not None else None,
+            'node_filter': self.node_filter,
             'action':
             self.action,
             'terminated':
@@ -449,6 +467,7 @@ class Task(object):
             'terminate',
             'updated',
             'retry',
+            'node_filter',
         ]

         for f in simple_fields:
diff --git a/drydock_provisioner/orchestrator/actions/orchestrator.py b/drydock_provisioner/orchestrator/actions/orchestrator.py
index b72f5a93..a7ba4f4a 100644
--- a/drydock_provisioner/orchestrator/actions/orchestrator.py
+++ b/drydock_provisioner/orchestrator/actions/orchestrator.py
@@ -58,6 +58,38 @@ class BaseAction(object):

         return task_futures

+    def _split_action(self):
+        """Start a parallel action for each node in scope.
+
+        Start a threaded instance of this action for each node in
+        scope of this action's task. A subtask will be created for each
+        action. Returns a dictionary of the concurrent.futures managed
+        threads running the actions.
+
+        :returns: dictionary of subtask_id.bytes => Future instance
+        """
+        target_nodes = self.orchestrator.get_target_nodes(self.task)
+
+        if len(target_nodes) > 1:
+            self.logger.info(
+                "Found multiple target nodes in task %s, splitting..." % str(
+                    self.task.get_id()))
+        split_tasks = dict()
+
+        with concurrent.futures.ThreadPoolExecutor() as te:
+            for n in target_nodes:
+                split_task = self.orchestrator.create_task(
+                    design_ref=self.task.design_ref,
+                    action=hd_fields.OrchestratorAction.PrepareNodes,
+                    node_filter=self.orchestrator.
+                    create_nodefilter_from_nodelist([n]))
+                self.task.register_subtask(split_task)
+                action = self.__class__(split_task, self.orchestrator,
+                                        self.state_manager)
+                split_tasks[split_task.get_id().bytes] = te.submit(action.start)
+
+        return split_tasks
+
     def _collect_subtask_futures(self, subtask_futures, timeout=300):
         """Collect Futures executing on subtasks or timeout.

@@ -66,10 +98,10 @@ class BaseAction(object):
         :param subtask_futures: dictionary of subtask_id.bytes -> Future instance
         :param timeout: The number of seconds to wait for all Futures to complete
+        :param bubble: Whether to bubble results from collected subtasks
         """
         finished, timed_out = concurrent.futures.wait(
             subtask_futures.values(), timeout=timeout)
-        self.task.align_result()

         for k, v in subtask_futures.items():
             if not v.done():
@@ -86,11 +118,16 @@ class BaseAction(object):
                     "Uncaught excetion in subtask %s future:" % str(
                         uuid.UUID(bytes=k)),
                     exc_info=v.exception())
+            st = self.state_manager.get_task(uuid.UUID(bytes=k))
+            st.bubble_results()
+            st.align_result()
+            st.save()

         if len(timed_out) > 0:
             raise errors.CollectSubtaskTimeout(
                 "One or more subtask threads did not finish in %d seconds." %
                 timeout)

+        self.task.align_result()
         return

     def _load_site_design(self):
@@ -108,6 +145,26 @@ class BaseAction(object):

         return site_design

+    def _get_driver(self, driver_type, subtype=None):
+        """Locate the correct driver instance based on type and subtype.
+
+        :param driver_type: The type of driver, 'oob', 'node', or 'network'
+        :param subtype: In cases where multiple drivers can be active, select
+            one based on subtype
+        """
+        driver = None
+        if driver_type == 'oob':
+            if subtype is None:
+                return None
+            for d in self.orchestrator.enabled_drivers['oob']:
+                if d.oob_type_support(subtype):
+                    driver = d
+                    break
+        else:
+            driver = self.orchestrator.enabled_drivers[driver_type]
+
+        return driver
+

 class Noop(BaseAction):
     """Dummy action to allow the full task completion flow without impacts."""
@@ -162,7 +219,10 @@ class ValidateDesign(BaseAction):
                     self.task.design_ref)
                 self.task.merge_status_messages(task_result=status)
                 self.task.set_status(hd_fields.TaskStatus.Complete)
-                self.task.success()
+                if status.result.status == hd_fields.ActionResult.Success:
+                    self.task.success()
+                else:
+                    self.task.failure()
                 self.task.save()
             except Exception:
                 self.task.set_status(hd_fields.TaskStatus.Complete)
@@ -179,7 +239,7 @@ class VerifySite(BaseAction):
         self.task.set_status(hd_fields.TaskStatus.Running)
         self.task.save()

-        node_driver = self.orchestrator.enabled_drivers['node']
+        node_driver = self._get_driver('node')

         if node_driver is None:
             self.task.set_status(hd_fields.TaskStatus.Complete)
@@ -225,7 +285,7 @@ class PrepareSite(BaseAction):
         self.task.set_status(hd_fields.TaskStatus.Running)
         self.task.save()

-        driver = self.orchestrator.enabled_drivers['node']
+        driver = self._get_driver('node')

         if driver is None:
             self.task.set_status(hd_fields.TaskStatus.Complete)
@@ -240,6 +300,19 @@ class PrepareSite(BaseAction):
             self.task.save()
             return

+        self.step_networktemplate(driver)
+        self.step_usercredentials(driver)
+
+        self.task.align_result()
+        self.task.set_status(hd_fields.TaskStatus.Complete)
+        self.task.save()
+        return
+
+    def step_networktemplate(self, driver):
+        """Run the CreateNetworkTemplate step of this action.
+
+        :param driver: The driver instance to use for execution.
+        """
         site_network_task = self.orchestrator.create_task(
             design_ref=self.task.design_ref,
             action=hd_fields.OrchestratorAction.CreateNetworkTemplate)
@@ -259,6 +332,11 @@ class PrepareSite(BaseAction):
         self.logger.info("Node driver task %s complete" %
                          (site_network_task.get_id()))

+    def step_usercredentials(self, driver):
+        """Run the ConfigureUserCredentials step of this action.
+
+        :param driver: The driver instance to use for execution.
+        """
         user_creds_task = self.orchestrator.create_task(
             design_ref=self.task.design_ref,
             action=hd_fields.OrchestratorAction.ConfigureUserCredentials)
@@ -276,12 +354,7 @@ class PrepareSite(BaseAction):
             ctx=str(user_creds_task.get_id()),
             ctx_type='task')
         self.logger.info("Node driver task %s complete" %
-                         (site_network_task.get_id()))
-
-        self.task.align_result()
-        self.task.set_status(hd_fields.TaskStatus.Complete)
-        self.task.save()
-        return
+                         (user_creds_task.get_id()))


 class VerifyNodes(BaseAction):
@@ -310,11 +383,7 @@ class VerifyNodes(BaseAction):
         task_futures = dict()

         for oob_type, oob_nodes in oob_type_partition.items():
-            oob_driver = None
-            for d in self.orchestrator.enabled_drivers['oob']:
-                if d.oob_type_support(oob_type):
-                    oob_driver = d
-                    break
+            oob_driver = self._get_driver('oob', oob_type)

             if oob_driver is None:
                 self.logger.warning(
@@ -366,11 +435,7 @@ class PrepareNodes(BaseAction):
         self.task.set_status(hd_fields.TaskStatus.Running)
         self.task.save()

-        # NOTE(sh8121att) Should we attempt to interrogate the node via Node
-        #                 Driver to see if it is in a deployed state before we
-        #                 start rebooting? Or do we just leverage
-        #                 Drydock internal state via site build data (when implemented)?
-        node_driver = self.orchestrator.enabled_drivers['node']
+        node_driver = self._get_driver('node')

         if node_driver is None:
             self.task.set_status(hd_fields.TaskStatus.Complete)
@@ -388,129 +453,124 @@ class PrepareNodes(BaseAction):
         design_status, site_design = self.orchestrator.get_effective_site(
             self.task.design_ref)

-        target_nodes = self.orchestrator.process_node_filter(
-            self.task.node_filter, site_design)
+        target_nodes = self.orchestrator.get_target_nodes(self.task)

-        oob_type_partition = {}
+        # Parallelize this task so that node progression is
+        # not interlinked
+        if len(target_nodes) > 1:
+            self.split_action()
+        else:
+            check_task_id = self.step_checkexist(node_driver)
+            node_check_task = self.state_manager.get_task(check_task_id)

-        for n in target_nodes:
-            if n.oob_type not in oob_type_partition.keys():
-                oob_type_partition[n.oob_type] = []
+            # Build the node_filter from the failures of the node_check_task
+            # as these are the nodes that are unknown to the node provisioner
+            # and are likely safe to be rebooted

-            oob_type_partition[n.oob_type].append(n)
+            target_nodes = self.orchestrator.process_node_filter(
+                node_check_task.node_filter_from_failures(), site_design)

-        task_futures = dict()
-        oob_type_filters = dict()
+            # And log the nodes that were found so they can be addressed

-        for oob_type, oob_nodes in oob_type_partition.items():
-            oob_driver = None
-            for d in self.orchestrator.enabled_drivers['oob']:
-                if d.oob_type_support(oob_type):
-                    oob_driver = d
-                    break
+            for n in node_check_task.result.successes:
+                self.logger.debug(
+                    "Found node %s in provisioner, skipping OOB management." %
+                    (n))
+                self.task.add_status_msg(
+                    msg="Node found in provisioner, skipping OOB management",
+                    error=False,
+                    ctx=n,
+                    ctx_type='node')

-            if oob_driver is None:
-                self.logger.warning(
-                    "Node OOB type %s has no enabled driver." % oob_type)
-                self.task.failure()
-                for n in oob_nodes:
-                    self.task.add_status_msg(
-                        msg="Node %s OOB type %s is not supported." %
-                        (n.get_name(), oob_type),
-                        error=True,
-                        ctx=n.get_name(),
-                        ctx_type='node')
-                continue
+            self.step_oob_set_netboot(target_nodes)

-            oob_type_filters[
-                oob_type] = self.orchestrator.create_nodefilter_from_nodelist(
-                    oob_nodes)
+            # bubble results from the SetNodeBoot task and
+            # continue the execution with successful targets

-            setboot_task = self.orchestrator.create_task(
-                design_ref=self.task.design_ref,
-                action=hd_fields.OrchestratorAction.SetNodeBoot,
-                node_filter=oob_type_filters[oob_type])
-            self.task.register_subtask(setboot_task)
-
-            self.logger.info(
-                "Starting OOB driver task %s to set PXE boot for OOB type %s" %
-                (setboot_task.get_id(), oob_type))
-            task_futures.update(
-                self._parallelize_subtasks(oob_driver.execute_task,
-                                           [setboot_task.get_id()]))
-
-        try:
-            self._collect_subtask_futures(
-                task_futures,
-                timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60))
-            # Get successful nodes and add it to the node filter
-            # so the next step only happens for successfully configured nodes
             self.task.bubble_results(
                 action_filter=hd_fields.OrchestratorAction.SetNodeBoot)
-            for t, f in oob_type_filters.items():
-                oob_type_filters[t]['filter_set'].append(
-                    dict(
-                        filter_type='union',
-                        node_names=self.task.result.successes))
-            self.logger.debug(
-                "Collected subtasks for task %s" % str(self.task.get_id()))
-        except errors.CollectSubtaskTimeout as ex:
-            self.logger.warning(str(ex))

-        task_futures = dict()
+            target_nodes = self.orchestrator.process_node_filter(
+                self.task.node_filter_from_successes(), site_design)

-        for oob_type, oob_nodes in oob_type_partition.items():
-            oob_driver = None
-            for d in self.orchestrator.enabled_drivers['oob']:
-                if d.oob_type_support(oob_type):
-                    oob_driver = d
-                    break
+            self.step_oob_powercycle(target_nodes)

-            if oob_driver is None:
-                self.logger.warning(
-                    "Node OOB type %s has no enabled driver." % oob_type)
-                self.task.failure()
-                for n in oob_nodes:
-                    self.task.add_status_msg(
-                        msg="Node %s OOB type %s is not supported." %
-                        (n.get_name(), oob_type),
-                        error=True,
-                        ctx=n.get_name(),
-                        ctx_type='node')
-                continue
-
-            cycle_task = self.orchestrator.create_task(
-                design_ref=self.task.design_ref,
-                action=hd_fields.OrchestratorAction.PowerCycleNode,
-                node_filter=oob_type_filters[oob_type])
-            self.task.register_subtask(cycle_task)
-
-            self.logger.info(
-                "Starting OOB driver task %s to power cycle nodes for OOB type %s"
-                % (cycle_task.get_id(), oob_type))
-
-            task_futures.update(
-                self._parallelize_subtasks(oob_driver.execute_task,
-                                           [cycle_task.get_id()]))
-
-        try:
-            self._collect_subtask_futures(
-                task_futures,
-                timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60))
-            # Get successful nodes and add it to the node filter
-            # so the next step only happens for successfully configured nodes
+            # bubble results from the Powercycle task and
+            # continue the execution with successful targets
             self.task.bubble_results(
                 action_filter=hd_fields.OrchestratorAction.PowerCycleNode)
-            for t, f in oob_type_filters.items():
-                oob_type_filters[t]['filter_set'].append(
-                    dict(
-                        filter_type='union',
-                        node_names=self.task.result.successes))
-            self.logger.debug(
-                "Collected subtasks for task %s" % str(self.task.get_id()))
-        except errors.CollectSubtaskTimeout as ex:
-            self.logger.warning(str(ex))
+            target_nodes = self.orchestrator.process_node_filter(
+                self.task.node_filter_from_successes(), site_design)
+
+            self.step_node_identify(node_driver, target_nodes)
+
+            # We can only commission nodes that were successfully identified in the provisioner
+
+            self.task.bubble_results(
+                action_filter=hd_fields.OrchestratorAction.IdentifyNode)
+
+            target_nodes = self.orchestrator.process_node_filter(
+                self.task.node_filter_from_successes(), site_design)
+
+            self.step_node_configure_hw(node_driver, target_nodes)
+
+            self.task.bubble_results()
+            self.task.align_result()
+        self.task.set_status(hd_fields.TaskStatus.Complete)
+        self.task.save()
+        return
+
+    def step_node_configure_hw(self, node_driver, node_list):
+        """Execute the ConfigureHardware step of this action on a list of nodes.
+
+        :param node_driver: driver instance to use for execution
+        :param node_list: a list of objects.BaremetalNode instances
+        :return: list of uuid.UUID task ids of the tasks executing this step
+        """
+        if len(node_list) > 0:
+            self.logger.info(
+                "Starting hardware configuration task on %d nodes." %
+                len(node_list))
+
+            node_commission_task = None
+            while True:
+                if node_commission_task is None:
+                    node_commission_task = self.orchestrator.create_task(
+                        design_ref=self.task.design_ref,
+                        action=hd_fields.OrchestratorAction.ConfigureHardware,
+                        node_filter=self.orchestrator.
+                        create_nodefilter_from_nodelist(node_list))
+                    self.task.register_subtask(node_commission_task)
+
+                self.logger.info(
+                    "Starting node driver task %s to commission nodes." %
+                    (node_commission_task.get_id()))
+
+                node_driver.execute_task(node_commission_task.get_id())
+
+                node_commission_task = self.state_manager.get_task(
+                    node_commission_task.get_id())
+                try:
+                    if not node_commission_task.retry_task(max_attempts=3):
+                        break
+                except errors.MaxRetriesReached:
+                    self.task.failure()
+                    break
+            return [node_commission_task.get_id()]
+        else:
+            self.logger.warning(
+                "No nodes successfully identified, skipping commissioning subtask"
+            )
+            return list()
+
+    def step_node_identify(self, node_driver, node_list):
+        """Execute the IdentifyNode step of this action on a list of nodes.
+
+        :param node_driver: driver instance to use for execution
+        :param node_list: a list of objects.BaremetalNode instances
+        :return: list of uuid.UUID task ids of the tasks executing this step
+        """
         # IdentifyNode success will take some time after PowerCycleNode finishes
         # Retry the operation a few times if it fails before considering it a final failure
         # Each attempt is a new task which might make the final task tree a bit confusing
@@ -521,7 +581,6 @@ class PrepareNodes(BaseAction):
         self.logger.debug(
             "Will make max of %d attempts to complete the identify_node task."
             % max_attempts)
-        nf = self.task.node_filter_from_successes()

         node_identify_task = None

         while True:
@@ -529,7 +588,8 @@ class PrepareNodes(BaseAction):
                 node_identify_task = self.orchestrator.create_task(
                     design_ref=self.task.design_ref,
                     action=hd_fields.OrchestratorAction.IdentifyNode,
-                    node_filter=nf)
+                    node_filter=self.orchestrator.
+                    create_nodefilter_from_nodelist(node_list))
                 self.task.register_subtask(node_identify_task)

             self.logger.info(
@@ -552,46 +612,155 @@ class PrepareNodes(BaseAction):
                 self.task.failure()
                 break

-        # We can only commission nodes that were successfully identified in the provisioner
-        if len(node_identify_task.result.successes) > 0:
-            target_nf = node_identify_task.node_filter_from_successes()
+        return [node_identify_task.get_id()]
+
+    def step_oob_set_netboot(self, node_list):
+        """Execute the SetNetBoot step of this action on a list of nodes.
+
+        :param node_list: a list of objects.BaremetalNode instances
+        :return: list of uuid.UUID task ids of the tasks executing this step
+        """
+        oob_type_partition = {}
+
+        for n in node_list:
+            if n.oob_type not in oob_type_partition.keys():
+                oob_type_partition[n.oob_type] = []
+
+            oob_type_partition[n.oob_type].append(n)
+
+        task_futures = dict()
+
+        for oob_type, oob_nodes in oob_type_partition.items():
+            oob_driver = self._get_driver('oob', oob_type)
+
+            if oob_driver is None:
+                self.logger.warning(
+                    "Node OOB type %s has no enabled driver." % oob_type)
+                self.task.failure()
+                for n in oob_nodes:
+                    self.task.add_status_msg(
+                        msg="Node %s OOB type %s is not supported." %
+                        (n.get_name(), oob_type),
+                        error=True,
+                        ctx=n.get_name(),
+                        ctx_type='node')
+                continue
+
+            setboot_task = self.orchestrator.create_task(
+                design_ref=self.task.design_ref,
+                action=hd_fields.OrchestratorAction.SetNodeBoot,
+                node_filter=self.orchestrator.create_nodefilter_from_nodelist(
+                    oob_nodes))
+            self.task.register_subtask(setboot_task)
+
             self.logger.info(
-                "Found %s successfully identified nodes, starting commissioning."
-                % (len(node_identify_task.result.successes)))
+                "Starting OOB driver task %s to set PXE boot for OOB type %s" %
+                (setboot_task.get_id(), oob_type))
+            task_futures.update(
+                self._parallelize_subtasks(oob_driver.execute_task,
+                                           [setboot_task.get_id()]))

-            node_commission_task = None
-            while True:
-                if node_commission_task is None:
-                    node_commission_task = self.orchestrator.create_task(
-                        design_ref=self.task.design_ref,
-                        action=hd_fields.OrchestratorAction.ConfigureHardware,
-                        node_filter=target_nf)
-                    self.task.register_subtask(node_commission_task)
+        try:
+            self._collect_subtask_futures(
+                task_futures,
+                timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60))
+            self.logger.debug(
+                "Collected subtasks for task %s" % str(self.task.get_id()))
+        except errors.CollectSubtaskTimeout as ex:
+            self.logger.warning(str(ex))

-                self.logger.info(
-                    "Starting node driver task %s to commission nodes." %
-                    (node_commission_task.get_id()))
+        return [uuid.UUID(bytes=x) for x in task_futures.keys()]

-                node_driver.execute_task(node_commission_task.get_id())
+    def step_oob_powercycle(self, node_list):
+        """Execute the NodePowerCycle step of this action on a list of nodes.

-                node_commission_task = self.state_manager.get_task(
-                    node_commission_task.get_id())
-                try:
-                    if not node_commission_task.retry_task(max_attempts=3):
-                        break
-                except errors.MaxRetriesReached:
-                    self.task.failure()
-                    break
+        :param node_list: a list of objects.BaremetalNode instances
+        :return: list of uuid.UUID task ids of the tasks executing this step
+        """
+        oob_type_partition = {}

-        else:
-            self.logger.warning(
-                "No nodes successfully identified, skipping commissioning subtask"
-            )
+        for n in node_list:
+            if n.oob_type not in oob_type_partition.keys():
+                oob_type_partition[n.oob_type] = []

-        self.task.align_result()
-        self.task.set_status(hd_fields.TaskStatus.Complete)
-        self.task.save()
-        return
+            oob_type_partition[n.oob_type].append(n)
+
+        task_futures = dict()
+
+        for oob_type, oob_nodes in oob_type_partition.items():
+            oob_driver = self._get_driver('oob', oob_type)
+
+            if oob_driver is None:
+                self.logger.warning(
+                    "Node OOB type %s has no enabled driver." % oob_type)
+                self.task.failure()
+                for n in oob_nodes:
+                    self.task.add_status_msg(
+                        msg="Node %s OOB type %s is not supported." %
+                        (n.get_name(), oob_type),
+                        error=True,
+                        ctx=n.get_name(),
+                        ctx_type='node')
+                continue
+
+            cycle_task = self.orchestrator.create_task(
+                design_ref=self.task.design_ref,
+                action=hd_fields.OrchestratorAction.PowerCycleNode,
+                node_filter=self.orchestrator.create_nodefilter_from_nodelist(
+                    oob_nodes))
+            self.task.register_subtask(cycle_task)
+
+            self.logger.info(
+                "Starting OOB driver task %s to power cycle nodes for OOB type %s"
+                % (cycle_task.get_id(), oob_type))
+
+            task_futures.update(
+                self._parallelize_subtasks(oob_driver.execute_task,
+                                           [cycle_task.get_id()]))
+
+        try:
+            self._collect_subtask_futures(
+                task_futures,
+                timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60))
+            self.logger.debug(
+                "Collected subtasks for task %s" % str(self.task.get_id()))
+        except errors.CollectSubtaskTimeout as ex:
+            self.logger.warning(str(ex))
+
+        return [uuid.UUID(bytes=x) for x in task_futures.keys()]
+
+    def split_action(self):
+        """Split this action into independent threads per node."""
+        action_timeout = config.config_mgr.conf.timeouts.identify_node + \
+            config.config_mgr.conf.timeouts.configure_hardware
+        split_tasks = self._split_action()
+        self._collect_subtask_futures(split_tasks, timeout=action_timeout * 60)
+
+    def step_checkexist(self, driver):
+        """Query the driver for node existence to prevent impacting deployed nodes.
+
+        :param driver: driver instance to use for execution
+        :returns: uuid.UUID task id of the task that executed this step
+        """
+        node_check_task = self.orchestrator.create_task(
+            design_ref=self.task.design_ref,
+            action=hd_fields.OrchestratorAction.IdentifyNode,
+            node_filter=self.task.node_filter)
+
+        self.logger.info("Starting check task %s before rebooting nodes" %
+                         (node_check_task.get_id()))
+
+        task_futures = self._parallelize_subtasks(driver.execute_task,
+                                                  [node_check_task.get_id()])
+        self._collect_subtask_futures(
+            task_futures,
+            timeout=(config.config_mgr.conf.timeouts.drydock_timeout * 60))
+
+        node_check_task = self.state_manager.get_task(node_check_task.get_id())
+        node_check_task.bubble_results()
+        node_check_task.save()
+
+        return node_check_task.get_id()


 class DeployNodes(BaseAction):
@@ -699,6 +868,7 @@ class DeployNodes(BaseAction):
                 action=hd_fields.OrchestratorAction.DeployNode,
                 node_filter=node_platform_task.
                 node_filter_from_successes())
+            self.task.register_subtask(node_deploy_task)

             self.logger.info(
                 "Starting node driver task %s to deploy nodes." %
@@ -719,20 +889,23 @@ class DeployNodes(BaseAction):
                 "Unable to configure platform on any nodes, skipping deploy subtask"
             )

-        node_deploy_task.bubble_results(
-            action_filter=hd_fields.OrchestratorAction.DeployNode)
-
         if len(node_deploy_task.result.successes) > 0:
             node_bootaction_task = self.orchestrator.create_task(
                 design_ref=self.task.design_ref,
                 action=hd_fields.OrchestratorAction.BootactionReport,
                 node_filter=node_deploy_task.node_filter_from_successes())
+            self.task.register_subtask(node_bootaction_task)
             action = BootactionReport(node_bootaction_task, self.orchestrator,
                                       self.state_manager)
             action.start()
+            self.task.bubble_results(
+                action_filter=hd_fields.OrchestratorAction.BootactionReport)
+            self.task.align_result()
+        else:
+            self.task.bubble_results(
+                action_filter=hd_fields.OrchestratorAction.DeployNode)
+            self.task.align_result()

-        self.task.align_result(
-            action_filter=hd_fields.OrchestratorAction.BootactionReport)
         self.task.set_status(hd_fields.TaskStatus.Complete)
         self.task.save()
         return
@@ -745,16 +918,18 @@ class BootactionReport(BaseAction):
         self.task.set_status(hd_fields.TaskStatus.Running)
         self.task.save()

-        poll_start = datetime.utcnow()
+        poll_start = datetime.datetime.utcnow()

         still_running = True
         timeout = datetime.timedelta(
             minutes=config.config_mgr.conf.timeouts.bootaction_final_status)
-        running_time = datetime.utcnow() - poll_start
+        running_time = datetime.datetime.utcnow() - poll_start
         nodelist = [
             n.get_id() for n in self.orchestrator.get_target_nodes(self.task)
         ]

+        self.logger.debug(
+            "Waiting for bootaction response signals to complete.")
         while running_time < timeout:
             still_running = False
             for n in nodelist:
@@ -769,8 +944,14 @@ class BootactionReport(BaseAction):
                         still_running = True
                         break
             if still_running:
+                self.logger.debug("Still waiting on %d running bootactions." %
+                                  len(running_bas))
                 time.sleep(config.config_mgr.conf.poll_interval)
-                running_time = datetime.utcnow()
+                running_time = datetime.datetime.utcnow() - poll_start
+            else:
+                break
+
+        self.logger.debug("Signals complete or timeout reached.")

         for n in nodelist:
             bas = self.state_manager.get_boot_actions_for_node(n)
@@ -815,5 +996,7 @@ class BootactionReport(BaseAction):
             else:
                 self.task.failure(focus=n)

+        self.logger.debug("Completed collecting bootaction signals.")
+
         self.task.set_status(hd_fields.TaskStatus.Complete)
         self.task.save()
         return
diff --git a/drydock_provisioner/orchestrator/orchestrator.py b/drydock_provisioner/orchestrator/orchestrator.py
index 89cbe12f..704b4d84 100644
--- a/drydock_provisioner/orchestrator/orchestrator.py
+++ b/drydock_provisioner/orchestrator/orchestrator.py
@@ -130,7 +130,7 @@ class Orchestrator(object):
                     "Orchestrator %s denied leadership, sleeping to try again."
                     % str(self.orch_id))
                 # TODO(sh8121att) Make this configurable
-                time.sleep(300)
+                time.sleep(config.config_mgr.conf.leadership_claim_interval)
             else:
                 self.logger.info(
                     "Orchestrator %s successfully claimed leadership, polling for tasks."
diff --git a/drydock_provisioner/statemgmt/db/tables.py b/drydock_provisioner/statemgmt/db/tables.py
index 19903156..65676fdc 100644
--- a/drydock_provisioner/statemgmt/db/tables.py
+++ b/drydock_provisioner/statemgmt/db/tables.py
@@ -34,6 +34,7 @@ class Tasks(ExtendTable):
         Column('updated', DateTime),
         Column('design_ref', String(128)),
         Column('request_context', pg.JSON),
+        Column('node_filter', pg.JSON),
         Column('action', String(32)),
         Column('terminated', DateTime),
         Column('terminated_by', String(16)),
diff --git a/entrypoint.sh b/entrypoint.sh
index 1451dc8b..20f44fcf 100755
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -3,10 +3,21 @@ set -ex

 CMD="drydock"
 PORT=${PORT:-9000}
+# Number of uWSGI workers to handle API requests
+DRYDOCK_API_WORKERS=${DRYDOCK_API_WORKERS:-"1"}
+# Threads per worker
+DRYDOCK_API_THREADS=${DRYDOCK_API_THREADS:-"4"}

 if [ "$1" = 'server' ]; then
-#    exec uwsgi --http :${PORT} -w drydock_provisioner.drydock --callable drydock --enable-threads -L --pyargv "--config-file /etc/drydock/drydock.conf"
-    exec uwsgi --http :${PORT} --paste config:/etc/drydock/api-paste.ini --enable-threads -L --pyargv "--config-file /etc/drydock/drydock.conf"
+# Run Drydock under uWSGI
+#  --http - Port to listen for requests on
+#  --paste - Use PasteDeploy for handling requests and use the specified config INI file
+#  --enable-threads - Enable the Python GIL so that the service can be multithreaded
+#  -L - Turn off uWSGI request logging, rely totally on logging within the application
+#  --pyargv - Provide some command line arguments to the Python executable
+#  --threads - Number of threads each uWSGI worker should use for handling requests
+#  --workers - Number of uWSGI workers/processes for handling requests
+    exec uwsgi --http :${PORT} --paste config:/etc/drydock/api-paste.ini --enable-threads -L --pyargv "--config-file /etc/drydock/drydock.conf" --threads $DRYDOCK_API_THREADS --workers $DRYDOCK_API_WORKERS
 fi

 exec ${CMD} $@
diff --git a/Dockerfile b/images/drydock/Dockerfile
similarity index 100%
rename from Dockerfile
rename to images/drydock/Dockerfile
diff --git a/tests/integration/postgres/test_action_prepare_site.py b/tests/integration/postgres/test_action_prepare_site.py
new file mode 100644
index 00000000..47fe0254
--- /dev/null
+++ b/tests/integration/postgres/test_action_prepare_site.py
@@ -0,0 +1,52 @@
+# Copyright 2017 AT&T Intellectual Property.  All other rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generic testing for the orchestrator."""
+import pytest
+
+import drydock_provisioner.orchestrator.orchestrator as orch
+import drydock_provisioner.objects.fields as hd_fields
+
+from drydock_provisioner.orchestrator.actions.orchestrator import PrepareSite
+
+
+class TestActionPrepareSite(object):
+    @pytest.mark.skip(reason="test failure fixed in next PS")
+    def test_preparesite(self, input_files, deckhand_ingester, setup,
+                         drydock_state):
+        input_file = input_files.join("deckhand_fullsite.yaml")
+
+        design_ref = "file://%s" % str(input_file)
+
+        # Build a dummy object that looks like an oslo_config object
+        # so the orchestrator is configured w/ Noop drivers
+        class DummyConf(object):
+            oob_driver = list()
+            node_driver = 'drydock_provisioner.drivers.node.driver.NodeDriver'
+            network_driver = None
+
+        orchestrator = orch.Orchestrator(
+            enabled_drivers=DummyConf(),
+            state_manager=drydock_state,
+            ingester=deckhand_ingester)
+
+        task = orchestrator.create_task(
+            design_ref=design_ref,
+            action=hd_fields.OrchestratorAction.PrepareSite)
+
+        action = PrepareSite(task, orchestrator, drydock_state)
+        action.start()
+
+        task = drydock_state.get_task(task.get_id())
+
+        assert task.result.status == hd_fields.ActionResult.Success
diff --git a/tests/integration/postgres/test_api_bootaction.py b/tests/integration/postgres/test_api_bootaction.py
index c24be92e..56a3dd88 100644
--- a/tests/integration/postgres/test_api_bootaction.py
+++ b/tests/integration/postgres/test_api_bootaction.py
@@ -66,14 +66,14 @@ class TestClass(object):
         assert result.status == falcon.HTTP_403

     @pytest.fixture()
-    def seed_bootaction(self, blank_state, test_orchestrator, input_files):
+    def seed_bootaction(self, blank_state, yaml_orchestrator, input_files):
         """Add a task and boot action to the database for testing."""
         input_file = input_files.join("fullsite.yaml")
         design_ref = "file://%s" % input_file

-        test_task = test_orchestrator.create_task(
+        test_task = yaml_orchestrator.create_task(
             action=hd_fields.OrchestratorAction.Noop, design_ref=design_ref)

-        id_key = test_orchestrator.create_bootaction_context(
+        id_key = yaml_orchestrator.create_bootaction_context(
             'compute01', test_task)

         ba_ctx = dict(
@@ -83,10 +83,10 @@ class TestClass(object):
         return ba_ctx

     @pytest.fixture()
-    def falcontest(self, drydock_state, test_ingester, test_orchestrator):
+    def falcontest(self, drydock_state, yaml_ingester, yaml_orchestrator):
         """Create a test harness for the the Falcon API framework."""
         return testing.TestClient(
             start_api(
                 state_manager=drydock_state,
-                ingester=test_ingester,
-                orchestrator=test_orchestrator))
+                ingester=yaml_ingester,
+                orchestrator=yaml_orchestrator))
diff --git a/tests/integration/postgres/test_api_bootaction_status.py b/tests/integration/postgres/test_api_bootaction_status.py
index c16e041c..b789d6f4 100644
--- a/tests/integration/postgres/test_api_bootaction_status.py
+++ b/tests/integration/postgres/test_api_bootaction_status.py
@@ -93,12 +93,12 @@ class TestClass(object): assert result.status == falcon.HTTP_400 @pytest.fixture() - def seed_bootaction_status(self, blank_state, test_orchestrator, + def seed_bootaction_status(self, blank_state, yaml_orchestrator, input_files): """Add a task and boot action to the database for testing.""" input_file = input_files.join("fullsite.yaml") design_ref = "file://%s" % input_file - test_task = test_orchestrator.create_task( + test_task = yaml_orchestrator.create_task( action=hd_fields.OrchestratorAction.Noop, design_ref=design_ref) id_key = os.urandom(32) @@ -115,10 +115,10 @@ class TestClass(object): return ba @pytest.fixture() - def falcontest(self, drydock_state, test_ingester, test_orchestrator): + def falcontest(self, drydock_state, yaml_ingester, yaml_orchestrator): """Create a test harness for the the Falcon API framework.""" return testing.TestClient( start_api( state_manager=drydock_state, - ingester=test_ingester, - orchestrator=test_orchestrator)) + ingester=yaml_ingester, + orchestrator=yaml_orchestrator)) diff --git a/tests/integration/postgres/test_postgres.sh b/tests/integration/postgres/test_postgres.sh deleted file mode 100755 index 217dd0f1..00000000 --- a/tests/integration/postgres/test_postgres.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -sudo docker run --rm -dp 5432:5432 --name 'psql_integration' postgres:9.5 -sleep 15 - -psql -h localhost -c "create user drydock with password 'drydock';" postgres postgres -psql -h localhost -c "create database drydock;" postgres postgres - -sudo docker run --rm -t --net=host -e DRYDOCK_DB_URL="postgresql+psycopg2://drydock:drydock@localhost:5432/drydock" --entrypoint /usr/local/bin/alembic drydock:latest upgrade head - -py.test $1 -RESULT=$? - -sudo docker stop psql_integration - -exit $RESULT diff --git a/tests/postgres/start_postgres.sh b/tests/postgres/start_postgres.sh new file mode 100755 index 00000000..e4c85811 --- /dev/null +++ b/tests/postgres/start_postgres.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -x +DRYDOCK_IMAGE="${IMAGE_PREFIX}/${DRYDOCK_IMAGE_NAME}:${IMAGE_TAG}" + +if [[ ! -z $(docker ps | grep 'psql_integration') ]] +then + sudo docker stop 'psql_integration' +fi + +DRYDOCK_IMAGE=${DRYDOCK_IMAGE:-"drydock:latest"} + +if [[ ! 
diff --git a/tests/postgres/stop_postgres.sh b/tests/postgres/stop_postgres.sh
new file mode 100755
index 00000000..bed5cd27
--- /dev/null
+++ b/tests/postgres/stop_postgres.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+sudo docker stop 'psql_integration'
diff --git a/tests/start_postgres.sh b/tests/start_postgres.sh
deleted file mode 100755
index 80eaaf8e..00000000
--- a/tests/start_postgres.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-sudo docker run --rm -dp 5432:5432 --name 'psql_integration' postgres:9.5
-sleep 15
-
-psql -h localhost -c "create user drydock with password 'drydock';" postgres postgres
-psql -h localhost -c "create database drydock;" postgres postgres
-
-sudo docker run --rm -t --net=host -e DRYDOCK_DB_URL="postgresql+psycopg2://drydock:drydock@localhost:5432/drydock" --entrypoint /usr/local/bin/alembic drydock:latest upgrade head
-export DRYDOCK_DB_URL="postgresql+psycopg2://drydock:drydock@localhost:5432/drydock"
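NOTE: with startup and teardown now split into separate helpers, a caller can
guarantee cleanup even when the test run fails. An illustrative wrapper, not
part of this patch:

    tests/postgres/start_postgres.sh
    trap 'tests/postgres/stop_postgres.sh' EXIT
    py.test tests/integration/postgres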
diff --git a/tools/drydock_image_run.sh b/tools/drydock_image_run.sh
new file mode 100755
index 00000000..b9744c4e
--- /dev/null
+++ b/tools/drydock_image_run.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+set -x
+
+DRYDOCK_IMAGE="${IMAGE_PREFIX}/${DRYDOCK_IMAGE_NAME}:${IMAGE_TAG}"
+
+function start_db {
+    if [[ ! -z $(docker ps | grep 'psql_integration') ]]
+    then
+        sudo docker stop 'psql_integration'
+    fi
+
+    docker run --rm -dp 5432:5432 --name 'psql_integration' postgres:9.5
+    sleep 15
+
+    docker run --rm --net host postgres:9.5 psql -h localhost -c "create user drydock with password 'drydock';" postgres postgres
+    docker run --rm --net host postgres:9.5 psql -h localhost -c "create database drydock;" postgres postgres
+}
+
+function customize_conf {
+    default_conffile=$1
+    custom_conffile=$2
+
+    DRYDOCK_DB_URL=$(echo ${DRYDOCK_DB_URL} | sed -e 's/[\/&]/\\&/g')
+    sed -e "s/^#database_connect_string = <None>/database_connect_string = ${DRYDOCK_DB_URL}/" \
+        $default_conffile > $custom_conffile
+}
+
+function generate_conf {
+    tox -e genconfig > /dev/null
+    tox -e genpolicy > /dev/null
+    ETCDIR=$(mktemp -d)/drydock
+    mkdir -p ${ETCDIR} > /dev/null
+    cp etc/drydock/noauth-api-paste.ini ${ETCDIR}/api-paste.ini
+    customize_conf etc/drydock/drydock.conf.sample ${ETCDIR}/drydock.conf
+    cp etc/drydock/policy.yaml.sample ${ETCDIR}/policy.yaml
+    echo ${ETCDIR}
+}
+
+function init_db {
+    docker run --rm -t --net=host \
+        -e DRYDOCK_DB_URL="${DRYDOCK_DB_URL}" \
+        --entrypoint /usr/local/bin/alembic \
+        ${DRYDOCK_IMAGE} \
+        upgrade head
+}
+
+function test_drydock {
+    TMPETC=$1
+    docker run \
+        -d --name 'drydock_test' --net host \
+        -v ${TMPETC}:/etc/drydock \
+        ${DRYDOCK_IMAGE}
+
+    RESULT=$(curl --noproxy '*' -i 'http://127.0.0.1:9000/api/v1.0/tasks' | tr '\r' '\n' | head -1)
+    GOOD="HTTP/1.1 200 OK"
+    if [[ ${RESULT} == ${GOOD} ]]
+    then
+        RC=0
+    else
+        RC=1
+    fi
+
+    docker logs drydock_test
+    return $RC
+}
+
+function cleanup {
+    TMPDIR=$1
+    docker stop psql_integration
+    docker stop drydock_test
+    docker rm drydock_test
+#    rm -rf $TMPDIR
+}
+
+start_db
+export DRYDOCK_DB_URL="postgresql+psycopg2://drydock:drydock@localhost:5432/drydock"
+init_db
+
+TMPETC=$(generate_conf)
+
+test_drydock $TMPETC
+RC=$?
+
+cleanup $TMPETC
+
+exit $RC
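NOTE: test_drydock keys off the first HTTP status line only. The same smoke
check run by hand against a container started by this script reduces to
roughly the following; generate_conf installs the noauth api-paste.ini, so no
X-Auth-Token header should be required:

    curl --noproxy '*' -i http://127.0.0.1:9000/api/v1.0/tasks | head -1
    # Expect: HTTP/1.1 200 OK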
diff --git a/tools/helm_tk.sh b/tools/helm_tk.sh
new file mode 100755
index 00000000..51c906d8
--- /dev/null
+++ b/tools/helm_tk.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Script to setup helm-toolkit and helm dep up the drydock chart
+#
+HELM=$1
+HTK_REPO=${HTK_REPO:-"https://github.com/openstack/openstack-helm"}
+HTK_PATH=${HTK_PATH:-""}
+DEP_UP_LIST=${DEP_UP_LIST:-"drydock"}
+
+if [[ ! -z $(echo $http_proxy) ]]
+then
+    export no_proxy=$no_proxy,127.0.0.1
+fi
+
+set -x
+
+function helm_serve {
+    if [[ -d "$HOME/.helm" ]]; then
+        echo ".helm directory found"
+    else
+        ${HELM} init --client-only
+    fi
+    if [[ -z $(curl -s 127.0.0.1:8879 | grep 'Helm Repository') ]]; then
+        ${HELM} serve > /dev/null &
+        while [[ -z $(curl -s 127.0.0.1:8879 | grep 'Helm Repository') ]]; do
+            sleep 1
+            echo "Waiting for Helm Repository"
+        done
+    else
+        echo "Helm serve already running"
+    fi
+
+    if ${HELM} repo list | grep -q "^stable" ; then
+        ${HELM} repo remove stable
+    fi
+
+    ${HELM} repo add local http://localhost:8879/charts
+}
+
+mkdir -p build
+pushd build
+git clone --depth 1 $HTK_REPO || true
+pushd openstack-helm/$HTK_PATH
+
+git pull
+helm_serve
+make helm-toolkit
+popd && popd
+for c in $DEP_UP_LIST
+do
+    ${HELM} dep up charts/$c
+done
diff --git a/tox.ini b/tox.ini
index 2385f79e..848f417c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = coverage-unit,pep8,bandit
+envlist = pep8,bandit,unit
 
 [testenv]
 setenv = YAMLDIR = {toxinidir}/tests/yaml_samples/
@@ -26,14 +26,6 @@ commands=
     find {toxinidir}/drydock_provisioner -name '__init__.py' -exec yapf -i --style=pep8 \{\} ;
 
 [testenv:unit]
-setenv=
-    PYTHONWARNING=all
-    YAMLDIR={toxinidir}/tests/yaml_samples/
-commands=
-    py.test \
-    tests/unit/{posargs}
-
-[testenv:coverage-unit]
 usedevelop=True
 setenv=
     PYTHONWARNING=all
@@ -41,18 +33,26 @@ setenv=
 commands=
     py.test \
     --cov=drydock_provisioner \
-    tests/unit/{posargs}
+    {toxinidir}/tests/unit/{posargs}
 
 [testenv:integration]
+passenv=DRYDOCK_IMAGE_NAME IMAGE_PREFIX IMAGE_TAG
 setenv=
     PYTHONWARNING=all
 commands=
+    {toxinidir}/tests/postgres/start_postgres.sh
     py.test \
-    tests/integration/{posargs}
+    {toxinidir}/tests/integration/postgres/{posargs}
 
-[testenv:postgres]
+[testenv:coverage]
+usedevelop=True
+passenv=DRYDOCK_IMAGE_NAME IMAGE_PREFIX IMAGE_TAG
+setenv=
+    YAMLDIR={toxinidir}/tests/yaml_samples/
 commands=
-    {toxinidir}/tests/integration/postgres/test_postgres.sh {toxinidir}/tests/integration/postgres/{posargs}
+    {toxinidir}/tests/postgres/start_postgres.sh
+    py.test --cov=drydock_provisioner \
+    {toxinidir}/tests/unit/ {toxinidir}/tests/integration/postgres
 
 [testenv:genconfig]
 commands = oslo-config-generator --config-file=etc/drydock/drydock-config-generator.conf