[WIP - ubuntu-uplift] - support for ubuntu 18.04

Change-Id: Idaacd83e729a10f39e124bf3a788583a5d6c5e5b
2019-11-06 16:39:27 -07:00 · 2019-11-06 16:39:27 -07:00 · 983a3e85f2
parent a3c961efe6
commit 983a3e85f2
1 changed files with 257 additions and 0 deletions
--- a/tools/multi_nodes_gate/airship_gate/lib/bootaction-runner.sh
+++ b/tools/multi_nodes_gate/airship_gate/lib/bootaction-runner.sh
@ -59,6 +59,26 @@ else:
  fi
 }

+#######################################
+# Report a fatal error: print an optional message and a call-stack
+# backtrace, emit the __EXIT_MARKER__ sentinel, then keep writing until
+# stdout can no longer be written to.
+# Arguments: $@ - optional error message
+# Outputs:   message, backtrace and sentinel, all on stdout
+# Returns:   never returns; exits 1 once a write to stdout fails
+# NOTE(review): stdout is presumably a pipe that the reader closes after
+# seeing __EXIT_MARKER__ - confirm against the caller of this script.
+#######################################
+die(){
+  set +x
+  # print any passed error message, unless it is empty or only spaces
+  if [[ "$*" = *[!\ ]* ]]; then
+    echo "$@"
+  fi
+  echo "Backtrace:"
+  local i
+  for ((i=0;i<${#FUNCNAME[@]}-1;i++)); do
+    echo "$(caller "$i")"
+  done
+  echo __EXIT_MARKER__
+  # Exit after pipe closes to ensure all output is flushed first
+  while : ; do
+    echo "Waiting on exit..." || exit 1
+  done
+}
+export -f die
+# Any command failure that reaches the ERR trap aborts via die (no message)
+trap 'die' ERR
+set -x
+

 install_file(){
  local path="$1"
@ -110,6 +130,8 @@ echo "Using rendered manifests file '$rendered_file'"
 # env vars which can be set if you want to disable
 : "${DISABLE_SECCOMP_PROFILE:=}"
 : "${DISABLE_APPARMOR_PROFILES:=}"
+: "${DISABLE_NTPD_SETUP:=}"
+: "${DISABLE_DNS_CONFIGURE:=}"


 ###############################################################################
@ -171,3 +193,238 @@ if [[ ! $DISABLE_APPARMOR_PROFILES ]]; then
    systemctl reload apparmor.service
  fi
 fi
+
+###############################################################################
+# Set DNS servers
+###############################################################################
+
+# Prepend the manifest's upstream DNS servers and node domain to
+# /etc/resolv.conf (existing entries are kept, after the new ones), then
+# verify name resolution works. Skipped when DISABLE_DNS_CONFIGURE is set.
+if [[ ! $DISABLE_DNS_CONFIGURE ]]; then
+  # Fetch upstream DNS servers
+  # (RESULT is read right after each lookup; presumably manifests_lookup sets it)
+  manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
+                   "common-addresses" "['data']['dns']['upstream_servers']"
+  # $RESULT is deliberately unquoted so the list collapses onto one line
+  dns_servers="$(echo $RESULT | tr '\n' ' ')"
+  echo "Upstream DNS server(s): $dns_servers"
+
+  # Fetch domain name (used for dns search)
+  manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
+                   "common-addresses" "['data']['dns']['node_domain']"
+  domain="$RESULT"
+  echo "Domain located: '$domain'"
+
+  # Update node DNS settings
+  resolv_conf=/etc/resolv.conf
+  # Create the temp file for real (not 'mktemp -u') so the name cannot be
+  # claimed by another process between generation and first write.
+  new_resolv_conf="$(mktemp)"
+  for server in $dns_servers; do
+    # -F: dots are literal; -w: 10.0.0.1 must not match an existing 10.0.0.10
+    if ! grep -qFw -- "nameserver $server" "$resolv_conf"; then
+      echo "nameserver $server" >> "$new_resolv_conf"
+    fi
+  done
+  if ! grep -qF -- "domain $domain" "$resolv_conf"; then
+    echo "domain $domain" >> "$new_resolv_conf"
+  fi
+  if ! grep -qF -- "search $domain" "$resolv_conf"; then
+    echo "search $domain" >> "$new_resolv_conf"
+  fi
+  # Non-empty temp file means at least one entry was missing
+  if [[ -s $new_resolv_conf ]]; then
+    # preserve any existing settings, but they go to the end of the list
+    cat "$resolv_conf" >> "$new_resolv_conf"
+    # cp (not mv) overwrites the existing file in place
+    cp "$new_resolv_conf" "$resolv_conf"
+  fi
+  rm -f "$new_resolv_conf"
+
+  # perform nslookup test
+  # NOTE(review): dnsutils (nslookup) is only installed later, in the NTP
+  # section - confirm nslookup is already present on the image at this point.
+  nslookup google.com > /dev/null || \
+    die "DNS lookup for google.com failed. Check your manifest DNS servers."
+fi
+
+
+###############################################################################
+# Install and configure ntpd
+###############################################################################
+
+# The min number of NTP servers that should meet the below criteria
+# NOTE: ntp expects a quorum of at least 3 servers
+: "${NTPD_SERVER_COUNT_THRESHOLD:=3}"
+
+# The maximum allowed stratum
+: "${NTPD_SERVER_STRATUM_THRESHOLD:=4}"
+
+# The maximum allowed offset (milliseconds, absolute value)
+# NOTE: Ceph requires <= 50ms offset to function properly
+: "${NTPD_SERVER_OFFSET_THRESHOLD:=50}"
+
+# The maximum allowed jitter (milliseconds, absolute value)
+: "${NTPD_SERVER_JITTER_THRESHOLD:=25}"
+
+# The minimum allowed reach
+: "${NTPD_SERVER_REACH_THRESHOLD:=377}"
+
+# The maximum number of seconds to wait for above criteria to be met
+# NOTE: it takes 8 polling intervals to go from reach 0 to reach 377, and each
+# polling interval is 64 seconds, so timeout should be at least this much.
+: "${NTPD_HEALTH_TIMEOUT:=512}"
+
+# for ubuntu 18.04 LTS always restart ntp service
+# NOTE: these steps are outside the DISABLE_NTPD_SETUP check, so they run
+# even when the NTP setup/health check itself is disabled.
+timedatectl set-ntp no
+apt_install ntp
+systemctl restart ntp
+# give the restarted service time to come up before it is queried
+sleep 15
+
+# Configure ntpd with the manifest's NTP servers, then wait (up to
+# NTPD_HEALTH_TIMEOUT seconds in total, shared across all servers) until at
+# least NTPD_SERVER_COUNT_THRESHOLD of them meet the stratum, reach, jitter
+# and offset thresholds. Skipped when DISABLE_NTPD_SETUP is set.
+if [[ ! $DISABLE_NTPD_SETUP ]]; then
+  # Check if we are using a known bad kernel version that causes
+  # an inability to timesync to NTP servers.
+  if uname -a | grep '4.13.0-36-generic'; then
+    die "YOU ARE USING A KNOWN BAD KERNEL VERSION, AND MAY HAVE SKIPPED THE KERNEL UPDATE SECTION OF THIS SCRIPT. NTP WILL NOT RUN SUCCESSFULLY UNTIL THIS IS DONE."
+  fi
+
+  #apt_install ntp
+  apt_install bc
+  apt_install dnsutils #nslookup
+
+  # Remove Ubuntu pool entries to be consistent with MaaS deployed nodes
+  sed -i '/^pool.*ubuntu.*/d' /etc/ntp.conf
+
+  # Fetch NTP server list
+  #
+  # First try data.ntp.servers (new value in list format), then fallback
+  # to data.ntp.servers_joined (comma delimited format) if needed.
+  # (FAIL is checked right after the first lookup; presumably manifests_lookup
+  # sets it instead of aborting when its last argument is 'true')
+  manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
+                   "common-addresses" "['data']['ntp']['servers']" '' true
+  ntp_servers="$(echo "$RESULT" | tr '\n' ' ')"
+  if [[ $FAIL = true ]]; then
+    manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
+                     "common-addresses" "['data']['ntp']['servers_joined']"
+    ntp_servers="$(echo $RESULT | tr ',' ' ')"
+  fi
+  echo "Upstream NTP server(s): $ntp_servers"
+
+  # Append any server not yet configured; restart ntp only if something changed
+  restart=false
+  for ntp_server in $ntp_servers; do
+    svr_expr="server ${ntp_server} iburst"
+    if ! grep "^$svr_expr" /etc/ntp.conf; then
+      echo "$svr_expr" >> /etc/ntp.conf
+      restart=true
+    fi
+  done
+  if [[ $restart = true ]]; then
+    systemctl restart ntp
+    # takes some time for assoc IDs to get populated
+    sleep 15
+  fi
+
+  ntp_server_healthy_count=0
+  time_waited=0
+  sleep_interval=10
+  # Second column of 'ntpq -c associations', reduced to digits only
+  assids="$(ntpq -c associations | awk '{print $2}' | \
+                                   tr -cd '[:digit:]\n' | tr '\n' ' ')"
+  # Speed improvement / optimization - perform reverse DNS lookups up front
+  s_idx=0
+  for server in $ntp_servers; do
+    # Don't stop on DNS lookup failure
+    dns_info="$(nslookup "$server")" || true
+    # First see if reverse lookup info is there (applicable to FQDNs), then see
+    # if straight DNS lookup info is there (applicable to IPs); mark as DNS
+    # lookup failure otherwise. TODO: Change to more robust dig commands.
+    #
+    # Each candidate is tested for emptiness explicitly: a pipeline's exit
+    # status is that of its last command unless pipefail is set, so an '||'
+    # chain cannot be relied on to reach the fallbacks.
+    alias_candidate="$(awk '/Name:/ {print $2; exit}' <<< "$dns_info")"
+    if [[ -z $alias_candidate ]]; then
+      # 4th field of the first 'name = ' line, minus its last character
+      alias_candidate="$(awk '/name = / {sub(/.$/, "", $4); print $4; exit}' <<< "$dns_info")"
+    fi
+    if [[ -z $alias_candidate ]]; then
+      alias_candidate=DNS_LOOKUP_FAILED
+    fi
+    server_alias_list[$s_idx]="$alias_candidate"
+    s_idx=$(($s_idx + 1))
+  done
+  # Speed improvement / optimization - perform mreadvar of srcadr up front,
+  # because each call performs a reverse DNS lookup, which can take a long
+  # time if preferred DNS servers in /etc/resolv.conf are hitting their
+  # timeout before trying the next one.
+  a_idx=0
+  for assid in $assids; do
+    if [[ -n $assid ]]; then
+      assid_srcadr_list[$a_idx]="$(ntpq -c "mreadvar $assid $assid srcadr" | cut -d'=' -f2)"
+      a_idx=$(($a_idx + 1))
+    fi
+  done
+
+  # For every association, find the manifest server it belongs to (by name or
+  # by DNS alias) and wait for that server to become healthy.
+  a_idx=0
+  for assid in $assids; do
+    if [[ -n $assid ]]; then
+      ntpq_reported_srcadr="${assid_srcadr_list[$a_idx]}"
+      a_idx=$(($a_idx + 1))
+      s_idx=0
+      for server in $ntp_servers; do
+        server_alias="${server_alias_list[$s_idx]}"
+        s_idx=$(($s_idx + 1))
+        # Right-hand sides are quoted so they compare literally, not as globs
+        if [[ $ntpq_reported_srcadr = "$server" ]] || \
+           [[ $ntpq_reported_srcadr = "$server_alias" ]]; then
+          stratum="$(ntpq -c "mreadvar $assid $assid stratum" | cut -d'=' -f2)"
+          if [[ $stratum -gt $NTPD_SERVER_STRATUM_THRESHOLD ]]; then
+            echo "NTP server '$server' with stratum '$stratum' did not meet the stratum threshold '$NTPD_SERVER_STRATUM_THRESHOLD'"
+            echo "If you're certain you wish to use this time server, then increase the NTPD_SERVER_STRATUM_THRESHOLD."
+            echo "NTP server exceeds defined stratum threshold."
+            continue
+          fi
+          failed_server=false
+          while [[ $(ntpq -c "mreadvar $assid $assid reach" | cut -d'=' -f2) -lt \
+                $NTPD_SERVER_REACH_THRESHOLD ]]; do
+            if [[ $time_waited -ge $NTPD_HEALTH_TIMEOUT ]]; then
+              echo "NTP server '$server' did not achieve desired reach within timeout period '$NTPD_HEALTH_TIMEOUT' seconds."
+              failed_server=true
+              break
+            fi
+            echo "NTP reach threshold not achieved yet for '$server'"
+            echo "reach is '$(ntpq -c "mreadvar $assid $assid reach" | cut -d'=' -f2)', expected >= '$NTPD_SERVER_REACH_THRESHOLD'"
+            echo "Waiting up to '$NTPD_HEALTH_TIMEOUT's for reach >= threshold value."
+            sleep $sleep_interval
+            time_waited=$(($time_waited + $sleep_interval))
+          done
+          if [[ $failed_server = true ]]; then continue; fi
+          # use bc for floating point comparison
+          # With bc output, 0 = false, 1 = true (opposite of return codes)
+          # NOTE(review): if ntpq yields no value, bc prints nothing and the
+          # '= 0' test ends the loop as if the check passed - confirm intended.
+          while [[ $(echo "$(ntpq -c "mreadvar $assid $assid jitter" | cut -d'=' -f2) <= $NTPD_SERVER_JITTER_THRESHOLD" | bc -l) = 0 ]]; do
+            if [[ $time_waited -ge $NTPD_HEALTH_TIMEOUT ]]; then
+              echo "Jitter is above threshold."
+              echo "A high jitter indicates an inability to converge time with this time server."
+              echo "IN MOST CASES, ISSUE WILL RESOLVE WITHIN A COUPLE OF HOURS - monitor for decreasing jitter values with 'ntpq -p'."
+              echo "If error persists, troubleshoot the network connection to timeserver."
+              echo "NTP server '$server' did not achieve desired jitter within timeout period '$NTPD_HEALTH_TIMEOUT' seconds."
+              failed_server=true
+              break
+            fi
+            echo "NTP jitter threshold not achieved yet for '$server'"
+            echo "jitter is '$(ntpq -c "mreadvar $assid $assid jitter" | cut -d '=' -f2)', expected <= '$NTPD_SERVER_JITTER_THRESHOLD'"
+            echo "Waiting up to '$NTPD_HEALTH_TIMEOUT's for jitter <= threshold value."
+            sleep $sleep_interval
+            time_waited=$(($time_waited + $sleep_interval))
+          done
+          if [[ $failed_server = true ]]; then continue; fi
+          # use bc for floating point comparison
+          # With bc output, 0 = false, 1 = true (opposite of return codes)
+          # sqrt(x^2) used to get absolute value, since offset may be + or -
+          while [[ $(echo "sqrt($(ntpq -c "mreadvar $assid $assid offset" | cut -d '=' -f2)^2) <= $NTPD_SERVER_OFFSET_THRESHOLD" | bc -l) = 0 ]]; do
+            if [[ $time_waited -ge $NTPD_HEALTH_TIMEOUT ]]; then
+              echo "NTP server '$server' did not achieve desired offset within timeout period '$NTPD_HEALTH_TIMEOUT' seconds."
+              failed_server=true
+              break
+            fi
+            echo "NTP offset threshold not achieved yet for '$server'"
+            echo "offset is '$(ntpq -c "mreadvar $assid $assid offset" | cut -d'=' -f2)', expected <= '$NTPD_SERVER_OFFSET_THRESHOLD'"
+            echo "Waiting up to '$NTPD_HEALTH_TIMEOUT's for offset <= threshold value."
+            sleep $sleep_interval
+            time_waited=$(($time_waited + $sleep_interval))
+          done
+          if [[ $failed_server = true ]]; then continue; fi
+          ntp_server_healthy_count=$(($ntp_server_healthy_count + 1))
+        fi
+        # Break out once threshold is met
+        if [[ $ntp_server_healthy_count -ge $NTPD_SERVER_COUNT_THRESHOLD ]]; then
+          break
+        fi
+      done
+    fi
+    # Break out once threshold is met
+    if [[ $ntp_server_healthy_count -ge $NTPD_SERVER_COUNT_THRESHOLD ]]; then
+      break
+    fi
+  done
+  if [[ $ntp_server_healthy_count -lt $NTPD_SERVER_COUNT_THRESHOLD ]]; then
+    echo "Expected '$NTPD_SERVER_COUNT_THRESHOLD' healthy NTP servers, but got '$ntp_server_healthy_count'"
+    die "Did not satisfy the minimum number of healthy NTP servers. See https://wiki.web.att.com/display/CCPdev/Network+Cloud+Troubleshooting#NetworkCloudTroubleshooting-TroubleshootingTimeSyncIssues"
+  fi
+fi