Updated resiliency gate

Updated the resiliency gate scripts to consistently pass all gate stages,
using the Ubuntu Bionic image for node deployment.

- Updated developer-onboarding.rst with information on how to configure
  and run the resiliency gate behind a corporate proxy (see the example
  run sketched below).
- Updated the gate scripts to use the proxy configuration.
- Updated up.sh to pull the hyperkube image as a cache, to speed up and
  stabilize the initial kubelet deployment of Kubernetes cluster services.
- Updated and added sleeps and retries in some gate stages and scripts
  to avoid gate failures due to transient environment issues.
- Updated the Ubuntu base image for node deployments from Xenial to
  Bionic.
- Added code in the teardown-nodes stage to manually remove the
  kubernetes and calico etcd members, since they remain listed as etcd
  members on the genesis node even after genesis is torn down.
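
For context, a full gate run behind a corporate proxy might look like the
sketch below; the tools/gate.sh entry point and manifest name are assumptions,
and the proxy URL is a placeholder (the deployed nodes themselves pick up
their proxy settings from the YAML files described in the onboarding doc):

    # Host-side proxy for downloads such as the base image and docker pulls.
    export http_proxy=http://proxy.example.com:8080
    export https_proxy="${http_proxy}"
    ./tools/gate.sh resiliency   # assumed entry point for the resiliency gate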

Change-Id: Ia11d66ab30ac7a07626d4f1d02a6da48155f862d
Ahmad Mahmoudi 2020-02-19 02:04:49 +00:00
parent cb4ae15eb1
commit 9f42b502f7
11 changed files with 91 additions and 20 deletions


@@ -43,6 +43,25 @@ debug it, e.g.:
     ./tools/g2/bin/ssh.sh n0
+
+Running Resiliency Tests Behind a Corporate Proxy
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If your development environment is behind a corporate proxy, you will need to
+update the following files with your environment's proxy information, DNS
+servers, and possibly your internal NTP servers, in order to deploy Airship:
+
+* `charts/coredns/values.yaml`: Update the upstream coredns nameserver IPs
+  to your internal DNS addresses.
+* `examples/basic/KubernetesNetwork.yaml`: Since the resiliency manifest uses
+  the examples/basic environment configuration, you will need to update
+  the Kubernetes network configuration in this folder. Update the upstream
+  nameserver IPs to your internal DNS addresses, and add the http(s) proxy URL
+  and additional_no_proxy list. If your environment requires it, also update
+  the NTP server list to your internal NTP server addresses for more reliable
+  time sync.
+* `tools/g2/templates/network-config.sub`: Update the upstream nameserver
+  IPs to your internal DNS addresses.
+
 Bootstrapping
 -------------
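
As a quick sanity check after editing these files, one can inspect a deployed
node (illustrative commands; the ssh helper is the one shown above):

    # The genesis/join scripts write the apt proxy config; confirm it landed.
    ./tools/g2/bin/ssh.sh n0 'cat /etc/apt/apt.conf.d/50proxyconf'
    # Confirm the upstream DNS servers were applied.
    ./tools/g2/bin/ssh.sh n0 'cat /etc/resolv.conf'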


@@ -74,6 +74,14 @@ export http_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }}
 export https_proxy={{ config['KubernetesNetwork:proxy.url'] | default('', true) }}
 export no_proxy={{ config.get(kind='KubernetesNetwork') | fill_no_proxy }}
+# Configure apt proxy
+if [[ -n "${http_proxy}" ]]; then
+    log "Configuring Apt Proxy"
+    cat << EOF | sudo tee /etc/apt/apt.conf.d/50proxyconf
+Acquire::https::proxy "${https_proxy}";
+Acquire::http::proxy "${http_proxy}";
+EOF
+fi
 # Install system packages
 #
@@ -139,5 +147,13 @@ fi
 if systemctl -q is-enabled containerd > /dev/null 2>&1; then
     systemctl restart containerd || true
 fi
+# Pull the hyperkube image prior to restarting kubelet. This makes the
+# initial image pull more reliable in environments with slow network
+# connectivity, avoiding image pull timeouts and retries by kubelet.
+# The || true lets the deployment continue even if $IMAGE_HYPERKUBE is
+# not defined in the environment and the image pull does not happen.
+docker image pull "${IMAGE_HYPERKUBE}" || true
 systemctl enable kubelet
 systemctl restart kubelet


@@ -222,7 +222,7 @@ function validate_kubectl_logs {
     NAMESPACE=default
     POD_NAME=log-test-${NODE}-$(date +%s)
-    cat <<EOPOD | kubectl --namespace $NAMESPACE apply -f -
+    cat <<EOPOD | kubectl --namespace $NAMESPACE --timeout 100s apply -f -
 ---
 apiVersion: v1
 kind: Pod
@@ -244,6 +244,7 @@ EOPOD
     wait_for_node_ready $NODE 300
     wait_for_pod_termination $NAMESPACE $POD_NAME
+    sleep 5
     ACTUAL_LOGS=$(kubectl --namespace $NAMESPACE logs $POD_NAME)
     if [ "x$ACTUAL_LOGS" != "xEXPECTED RESULT" ]; then
         log Got unexpected logs:


@@ -1,6 +1,6 @@
 export TEMP_DIR=${TEMP_DIR:-$(mktemp -d)}
-export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-68719476736}
-export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/16.04/release/ubuntu-16.04-server-cloudimg-amd64-disk1.img}
+export BASE_IMAGE_SIZE=${BASE_IMAGE_SIZE:-344784896}
+export BASE_IMAGE_URL=${BASE_IMAGE_URL:-https://cloud-images.ubuntu.com/releases/bionic/release/ubuntu-18.04-server-cloudimg-amd64.img}
 export IMAGE_PROMENADE=${IMAGE_PROMENADE:-quay.io/airshipit/promenade:master}
 export IMAGE_PROMENADE_DISTRO=${IMAGE_PROMENADE_DISTRO:-ubuntu_bionic}
 export IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE:-gcr.io/google_containers/hyperkube-amd64:v1.17.3}
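
Since every variable here uses the ${VAR:-default} pattern, a run can override
the defaults from the environment, e.g. to fetch the Bionic image from an
internal mirror (hypothetical URL; the size must match the mirrored file):

    export BASE_IMAGE_URL=https://mirror.example.com/ubuntu-18.04-server-cloudimg-amd64.img
    export BASE_IMAGE_SIZE=344784896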


@@ -14,3 +14,17 @@ etcdctl_member_list() {
     etcdctl_cmd "${CLUSTER}" "${VM}" member list -w json | jq -r '.members[].name' | sort
 }
+etcdctl_member_remove() {
+    CLUSTER=${1}
+    VM=${2}
+    NODE=${3}
+    shift 3
+    MEMBER_ID=$(etcdctl_cmd "${CLUSTER}" "${VM}" member list | awk -F', ' "/${NODE}/ "'{ print $1}')
+    if [[ -n $MEMBER_ID ]] ; then
+        etcdctl_cmd "${CLUSTER}" "${VM}" member remove "$MEMBER_ID"
+    else
+        log No members found in cluster "$CLUSTER" for node "$NODE"
+    fi
+}
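
The "Teardown Genesis" stage below ends up invoking this helper roughly as
follows, with n1 as the VM the etcdctl command is proxied through and n0 as
the node whose stale members are removed:

    etcdctl_member_remove kubernetes n1 n0
    etcdctl_member_remove calico n1 n0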


@@ -13,13 +13,24 @@ validate_etcd_membership() {
     EXPECTED_MEMBERS="${*}"
     # NOTE(mark-burnett): Wait a moment for disks in test environment to settle.
-    sleep 10
+    sleep 60
     log Validating "${CLUSTER}" etcd membership via "${VM}" for members: "${EXPECTED_MEMBERS[@]}"
-    FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
-    if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
-        log Etcd membership check failed for cluster "${CLUSTER}"
+    local retries=25
+    for ((n=0;n<=$retries;n++)); do
+        FOUND_MEMBERS=$(etcdctl_member_list "${CLUSTER}" "${VM}" | tr '\n' ' ' | sed 's/ $//')
         log "Found \"${FOUND_MEMBERS}\", expected \"${EXPECTED_MEMBERS}\""
-        exit 1
-    fi
+        if [[ "x${EXPECTED_MEMBERS}" != "x${FOUND_MEMBERS}" ]]; then
+            log Etcd membership check failed for cluster "${CLUSTER}" on attempt "$n".
+            if [[ "$n" == "$retries" ]]; then
+                log Etcd membership check failed for cluster "${CLUSTER}" after "$n" retries. Exiting.
+                exit 1
+            fi
+            sleep 30
+        else
+            log Etcd membership check succeeded for cluster "${CLUSTER}" on attempt "${n}"
+            break
+        fi
+    done
 }


@@ -120,6 +120,8 @@
       "name": "Teardown Genesis",
       "script": "teardown-nodes.sh",
       "arguments": [
+        "-e", "kubernetes",
+        "-e", "calico",
         "-v", "n1",
         "-n", "n0",
         "-r"
@@ -160,7 +162,7 @@
     }
   ],
   "vm": {
-    "memory": 3072,
+    "memory": 4096,
     "names": [
       "n0",
       "n1",


@@ -7,7 +7,9 @@ source "${GATE_UTILS}"
 rsync_cmd "${TEMP_DIR}/scripts"/*genesis* "${GENESIS_NAME}:/root/promenade/"
 set -o pipefail
-ssh_cmd "${GENESIS_NAME}" env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
+ssh_cmd "${GENESIS_NAME}" env "IMAGE_HYPERKUBE=${IMAGE_HYPERKUBE}" \
+    env "PROMENADE_ENCRYPTION_KEY=${PROMENADE_ENCRYPTION_KEY}" \
+    /root/promenade/genesis.sh 2>&1 | tee -a "${LOG_FILE}"
 ssh_cmd "${GENESIS_NAME}" /root/promenade/validate-genesis.sh 2>&1 | tee -a "${LOG_FILE}"
 set +o pipefail


@@ -52,7 +52,7 @@ mkdir -p "${SCRIPT_DIR}"
 for NAME in "${NODES[@]}"; do
     log Building join script for node "${NAME}"
-    CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
+    CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
     if [[ $GET_KEYSTONE_TOKEN == 1 ]]; then
         TOKEN="$(os_ks_get_token "${VIA}")"
         if [[ -z $TOKEN ]]; then
@@ -67,7 +67,7 @@ for NAME in "${NODES[@]}"; do
     promenade_health_check "${VIA}"
     log "Validating documents"
-    ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
+    ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X POST -H "Content-Type: application/json" -d "$(promenade_render_validate_body "${USE_DECKHAND}" "${DECKHAND_REVISION}")" "$(promenade_render_validate_url)"
     JOIN_CURL_URL="$(promenade_render_curl_url "${NAME}" "${USE_DECKHAND}" "${DECKHAND_REVISION}" "${LABELS[@]}")"
     log "Fetching join script via: ${JOIN_CURL_URL}"


@@ -6,15 +6,15 @@ source "${GATE_UTILS}"
 VIA="n1"
-CURL_ARGS=("--fail" "--max-time" "300" "--retry" "16" "--retry-delay" "15")
+CURL_ARGS=("-v" "--max-time" "600" "--retry" "20" "--retry-delay" "15" "--connect-timeout" "30" "--progress-bar")
-log Adding labels to node n0
+log "Adding labels to node n0"
 JSON="{\"calico-etcd\": \"enabled\", \"coredns\": \"enabled\", \"kubernetes-apiserver\": \"enabled\", \"kubernetes-controller-manager\": \"enabled\", \"kubernetes-etcd\": \"enabled\", \"kubernetes-scheduler\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
-ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
+ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n0)"
 # Need to wait
-sleep 60
+sleep 120
 validate_etcd_membership kubernetes n1 n0 n1 n2 n3
 validate_etcd_membership calico n1 n0 n1 n2 n3
@@ -22,10 +22,10 @@ validate_etcd_membership calico n1 n0 n1 n2 n3
 log Removing labels from node n2
 JSON="{\"coredns\": \"enabled\", \"ucp-control-plane\": \"enabled\"}"
-ssh_cmd "${VIA}" curl -v "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
+ssh_cmd "${VIA}" curl "${CURL_ARGS[@]}" -X PUT -H "Content-Type: application/json" -d "${JSON}" "$(promenade_put_labels_url n2)"
 # Need to wait
-sleep 60
+sleep 120
 validate_cluster n1


@@ -8,8 +8,11 @@ declare -a NODES
 RECREATE=0
-while getopts "n:rv:" opt; do
+while getopts "e:n:rv:" opt; do
     case "${opt}" in
+        e)
+            ETCD_CLUSTERS+=("${OPTARG}")
+            ;;
         n)
            NODES+=("${OPTARG}")
            ;;
@@ -35,6 +38,9 @@ fi
 for NAME in "${NODES[@]}"; do
     log Tearing down node "${NAME}"
     promenade_teardown_node "${NAME}" "${VIA}"
+    for ETCD_CLUSTER in "${ETCD_CLUSTERS[@]}"; do
+        etcdctl_member_remove "${ETCD_CLUSTER}" "${VIA}" "${NAME}"
+    done
     vm_clean "${NAME}"
     if [[ ${RECREATE} == "1" ]]; then
         vm_create "${NAME}"
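
With the new -e flag, the updated "Teardown Genesis" stage is equivalent to an
invocation along these lines (script path as referenced in the manifest):

    teardown-nodes.sh -e kubernetes -e calico -v n1 -n n0 -r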