[WIP - ubuntu-uplift] - support for ubuntu 18.04

Change-Id: Idaacd83e729a10f39e124bf3a788583a5d6c5e5b
This commit is contained in:
Yasin, Sirajudeen (SY495P) 2019-11-06 16:39:27 -07:00
parent a3c961efe6
commit 983a3e85f2
1 changed files with 257 additions and 0 deletions

View File

@ -59,6 +59,26 @@ else:
fi
}
# Abort the script: print an optional message, a backtrace and an exit marker,
# then keep writing until the output pipe is closed before exiting non-zero.
# Arguments: $@ - optional error message (skipped when empty or only spaces)
# Outputs:   message, backtrace and marker are all written to stdout
die() {
  set +x
  local depth=$(( ${#FUNCNAME[@]} - 1 ))
  local frame=0
  # echo any passed error message that contains a non-space character
  [[ $@ != *[!\ ]* ]] || echo "$@"
  echo "Backtrace:"
  while (( frame < depth )); do
    echo $(caller $frame)
    frame=$(( frame + 1 ))
  done
  echo __EXIT_MARKER__
  # Exit only once a write fails (i.e. the pipe has closed), so that all
  # earlier output has been flushed to the reader first
  until ! echo "Waiting on exit..."; do
    :
  done
  exit 1
}
# Export the die function so child bash processes that inherit this
# environment can call it as well.
export -f die
# Invoke die (with no message argument) whenever bash fires the ERR trap,
# i.e. when a command fails outside of a condition.
trap 'die' ERR
# Trace every command as it is executed.
set -x
install_file(){
local path="$1"
@ -110,6 +130,8 @@ echo "Using rendered manifests file '$rendered_file'"
# Env vars which can be set (to any non-empty value) if you want to disable
# the corresponding step; each one defaults to empty, i.e. step enabled.
# The expansions are quoted so a caller-supplied value is never subject to
# word splitting or globbing when the defaults are applied.
: "${DISABLE_SECCOMP_PROFILE:=}"
: "${DISABLE_APPARMOR_PROFILES:=}"
: "${DISABLE_NTPD_SETUP:=}"
: "${DISABLE_DNS_CONFIGURE:=}"
###############################################################################
@ -171,3 +193,238 @@ if [[ ! $DISABLE_APPARMOR_PROFILES ]]; then
systemctl reload apparmor.service
fi
fi
###############################################################################
# Set DNS servers
###############################################################################
# Prepend the manifest's upstream DNS servers and node domain to
# /etc/resolv.conf (unless already present), then verify name resolution.
# Skipped entirely when DISABLE_DNS_CONFIGURE is non-empty.
if [[ ! $DISABLE_DNS_CONFIGURE ]]; then
  # Fetch upstream DNS servers
  manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
    "common-addresses" "['data']['dns']['upstream_servers']"
  # $RESULT is left unquoted on purpose so the list collapses onto one line
  dns_servers="$(echo $RESULT | tr '\n' ' ')"
  echo "Upstream DNS server(s): $dns_servers"

  # Fetch domain name (used for dns search)
  manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
    "common-addresses" "['data']['dns']['node_domain']"
  domain="$RESULT"
  echo "Domain located: '$domain'"

  # Update node DNS settings
  resolv_conf=/etc/resolv.conf
  # Create the scratch file atomically; a name-only "mktemp -u" leaves a
  # window in which another process could claim the path.
  new_resolv_conf="$(mktemp)" || die "Unable to create a temporary file."

  # Presence checks use quiet, fixed-string, whole-word matching so that dots
  # are literal, "10.0.0.1" is not mistaken for "10.0.0.10", and matched
  # lines are not echoed into the log.
  for server in $dns_servers; do
    if ! grep -qwF -- "nameserver $server" "$resolv_conf"; then
      echo "nameserver $server" >> "$new_resolv_conf"
    fi
  done
  if ! grep -qwF -- "domain $domain" "$resolv_conf"; then
    echo "domain $domain" >> "$new_resolv_conf"
  fi
  if ! grep -qwF -- "search $domain" "$resolv_conf"; then
    echo "search $domain" >> "$new_resolv_conf"
  fi

  # Only rewrite resolv.conf when at least one new entry was collected
  if [[ -s $new_resolv_conf ]]; then
    # preserve any existing settings, but they go to the end of the list
    cat "$resolv_conf" >> "$new_resolv_conf"
    cp "$new_resolv_conf" "$resolv_conf"
  fi
  rm -f -- "$new_resolv_conf"

  # perform nslookup test
  # NOTE(review): dnsutils (which provides nslookup) is only installed in the
  # ntpd section further down; confirm nslookup is already present here.
  nslookup google.com > /dev/null || \
    die "DNS lookup for google.com failed. Check your manifest DNS servers."
fi
###############################################################################
# Install and configure ntpd
###############################################################################
# Tunable NTP health-check thresholds. Each may be overridden from the
# environment; the quoted expansions keep an overridden value from being
# word-split or glob-expanded while the defaults are applied.

# The min number of NTP servers that should meet the below criteria
# NOTE: ntp expects a quorum of at least 3 servers
: "${NTPD_SERVER_COUNT_THRESHOLD:=3}"
# The maximum allowed stratum
: "${NTPD_SERVER_STRATUM_THRESHOLD:=4}"
# The maximum allowed offset (milliseconds, absolute value)
# NOTE: Ceph requires <= 50ms offset to function properly
: "${NTPD_SERVER_OFFSET_THRESHOLD:=50}"
# The maximum allowed jitter (milliseconds, absolute value)
: "${NTPD_SERVER_JITTER_THRESHOLD:=25}"
# The minimum allowed reach
: "${NTPD_SERVER_REACH_THRESHOLD:=377}"
# The maximum number of seconds to wait for above criteria to be met
# NOTE: it takes 8 polling intervals to go from reach 0 to reach 377, and each
# polling interval is 64 seconds, so timeout should be at least this much.
: "${NTPD_HEALTH_TIMEOUT:=512}"
# for ubuntu 18.04 LTS always restart ntp service
# NOTE(review): these steps sit outside the DISABLE_NTPD_SETUP guard below, so
# they run even when that flag is set - confirm this is intended.
# Switch off timedatectl-managed network time sync before installing ntp
# (presumably so it does not compete with ntpd - confirm).
timedatectl set-ntp no
apt_install ntp
systemctl restart ntp
# Pause after the restart; presumably gives ntpd time to come up before the
# checks that follow - confirm.
sleep 15
# Configure ntpd with the manifest's NTP servers and wait until at least
# NTPD_SERVER_COUNT_THRESHOLD of them satisfy the stratum, reach, jitter and
# offset thresholds; die otherwise. Skipped when DISABLE_NTPD_SETUP is set.
if [[ ! $DISABLE_NTPD_SETUP ]]; then
  # Print the value of one ntpq variable for a single association.
  # Arguments: $1 - association ID, $2 - variable name (srcadr, reach, ...)
  # Outputs:   the text after the first '=' in ntpq's reply, on stdout
  ntpq_assoc_var() {
    ntpq -c "mreadvar $1 $1 $2" | cut -d'=' -f2
  }

  # Check if we are using a known bad kernel version that causes
  # an inability to timesync to NTP servers.
  if uname -a | grep -qF '4.13.0-36-generic'; then
    die "YOU ARE USING A KNOWN BAD KERNEL VERSION, AND MAY HAVE SKIPPED THE KERNEL UPDATE SECTION OF THIS SCRIPT. NTP WILL NOT RUN SUCCESSFULLY UNTIL THIS IS DONE."
  fi

  # ntp itself is installed before this section is entered
  apt_install bc
  apt_install dnsutils #nslookup

  # Remove Ubuntu pool entries to be consistent with MaaS deployed nodes
  sed -i '/^pool.*ubuntu.*/d' /etc/ntp.conf

  # Fetch NTP server list
  #
  # First try data.ntp.servers (new value in list format), then fallback
  # to data.ntp.servers_joined (comma delimited format) if needed.
  # NOTE(review): the trailing "'' true" arguments presumably make the lookup
  # set FAIL=true instead of aborting when the key is missing - confirm
  # against manifests_lookup.
  manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
    "common-addresses" "['data']['ntp']['servers']" '' true
  ntp_servers="$(echo "$RESULT" | tr '\n' ' ')"
  if [[ $FAIL = true ]]; then
    manifests_lookup "$rendered_file" "pegleg/CommonAddresses/v1" \
      "common-addresses" "['data']['ntp']['servers_joined']"
    ntp_servers="$(echo $RESULT | tr ',' ' ')"
  fi
  echo "Upstream NTP server(s): $ntp_servers"

  # Add any server not yet listed in ntp.conf; restart only if one was added
  restart=false
  for ntp_server in $ntp_servers; do
    svr_expr="server ${ntp_server} iburst"
    if ! grep -q "^$svr_expr" /etc/ntp.conf; then
      echo "$svr_expr" >> /etc/ntp.conf
      restart=true
    fi
  done
  if [[ $restart = true ]]; then
    systemctl restart ntp
    # takes some time for assoc IDs to get populated
    sleep 15
  fi

  ntp_server_healthy_count=0
  time_waited=0
  sleep_interval=10
  # Second column of the associations table, reduced to digits so that the
  # header rows drop out and only association IDs remain
  assids="$(ntpq -c associations | awk '{print $2}' | \
    tr -cd '[:digit:]\n' | tr '\n' ' ')"

  # Speed improvement / optimization - perform DNS lookups up front
  server_alias_list=()
  s_idx=0
  for server in $ntp_servers; do
    # Don't stop on DNS lookup failure
    dns_info="$(nslookup "$server")" || true
    # Try the "Name:" line first (printed by nslookup for a forward lookup),
    # then the "name = " line (printed for a reverse lookup, trailing dot
    # stripped); mark as DNS lookup failure otherwise. Emptiness is tested
    # explicitly because a pipeline's exit status is that of its last command,
    # so an "||" chain would never fall through without pipefail.
    # TODO: Change to more robust dig commands.
    server_alias="$(echo "$dns_info" | grep Name: | head -1 | awk '{print $2}')" || true
    if [[ -z $server_alias ]]; then
      server_alias="$(echo "$dns_info" | grep 'name = ' | head -1 | awk '{print $4}' | sed 's/.$//')" || true
    fi
    server_alias_list[$s_idx]="${server_alias:-DNS_LOOKUP_FAILED}"
    s_idx=$((s_idx + 1))
  done

  # Speed improvement / optimization - perform mreadvar of srcadr up front,
  # because each call performs a reverse DNS lookup, which can take a long
  # time if preferred DNS servers in /etc/resolv.conf are hitting their
  # timeout before trying the next one.
  assid_srcadr_list=()
  a_idx=0
  for assid in $assids; do
    assid_srcadr_list[$a_idx]="$(ntpq_assoc_var "$assid" srcadr)"
    a_idx=$((a_idx + 1))
  done

  # For every association, find the manifest server it belongs to and wait
  # for that server to meet each threshold. time_waited is shared by all
  # servers, so NTPD_HEALTH_TIMEOUT bounds the total wait, not a per-server one.
  a_idx=0
  for assid in $assids; do
    ntpq_reported_srcadr="${assid_srcadr_list[$a_idx]}"
    a_idx=$((a_idx + 1))
    s_idx=0
    for server in $ntp_servers; do
      server_alias="${server_alias_list[$s_idx]}"
      s_idx=$((s_idx + 1))
      # Right-hand sides are quoted so they compare literally, not as globs
      if [[ $ntpq_reported_srcadr == "$server" ||
            $ntpq_reported_srcadr == "$server_alias" ]]; then
        stratum="$(ntpq_assoc_var "$assid" stratum)"
        if [[ $stratum -gt $NTPD_SERVER_STRATUM_THRESHOLD ]]; then
          echo "NTP server '$server' with stratum '$stratum' did not meet the stratum threshold '$NTPD_SERVER_STRATUM_THRESHOLD'"
          echo "If you're certain you wish to use this time server, then increase the NTPD_SERVER_STRATUM_THRESHOLD."
          echo "NTP server exceeds defined stratum threshold."
          continue
        fi

        failed_server=false
        while [[ $(ntpq_assoc_var "$assid" reach) -lt \
                 $NTPD_SERVER_REACH_THRESHOLD ]]; do
          if [[ $time_waited -ge $NTPD_HEALTH_TIMEOUT ]]; then
            echo "NTP server '$server' did not achieve desired reach within timeout period '$NTPD_HEALTH_TIMEOUT' seconds."
            failed_server=true
            break
          fi
          echo "NTP reach threshold not achieved yet for '$server'"
          echo "reach is '$(ntpq_assoc_var "$assid" reach)', expected >= '$NTPD_SERVER_REACH_THRESHOLD'"
          echo "Waiting up to '$NTPD_HEALTH_TIMEOUT's for reach >= threshold value."
          sleep $sleep_interval
          time_waited=$((time_waited + sleep_interval))
        done
        if [[ $failed_server = true ]]; then continue; fi

        # use bc for floating point comparison
        # With bc output, 0 = false, 1 = true (opposite of return codes).
        # Anything other than an explicit 1 (including the empty output bc
        # gives when ntpq returned nothing usable) counts as "not yet met".
        while [[ $(echo "$(ntpq_assoc_var "$assid" jitter) <= $NTPD_SERVER_JITTER_THRESHOLD" | bc -l) != 1 ]]; do
          if [[ $time_waited -ge $NTPD_HEALTH_TIMEOUT ]]; then
            echo "Jitter is above threshold."
            echo "A high jitter indicates an inability to converge time with this time server."
            echo "IN MOST CASES, ISSUE WILL RESOLVE WITHIN A COUPLE OF HOURS - monitor for decreasing jitter values with 'ntpq -p'."
            echo "If error persists, troubleshoot the network connection to timeserver."
            echo "NTP server '$server' did not achieve desired jitter within timeout period '$NTPD_HEALTH_TIMEOUT' seconds."
            failed_server=true
            break
          fi
          echo "NTP jitter threshold not achieved yet for '$server'"
          echo "jitter is '$(ntpq_assoc_var "$assid" jitter)', expected <= '$NTPD_SERVER_JITTER_THRESHOLD'"
          echo "Waiting up to '$NTPD_HEALTH_TIMEOUT's for jitter <= threshold value."
          sleep $sleep_interval
          time_waited=$((time_waited + sleep_interval))
        done
        if [[ $failed_server = true ]]; then continue; fi

        # use bc for floating point comparison (same 0/1 convention as above)
        # sqrt(x^2) used to get absolute value, since offset may be + or -
        while [[ $(echo "sqrt($(ntpq_assoc_var "$assid" offset)^2) <= $NTPD_SERVER_OFFSET_THRESHOLD" | bc -l) != 1 ]]; do
          if [[ $time_waited -ge $NTPD_HEALTH_TIMEOUT ]]; then
            echo "NTP server '$server' did not achieve desired offset within timeout period '$NTPD_HEALTH_TIMEOUT' seconds."
            failed_server=true
            break
          fi
          echo "NTP offset threshold not achieved yet for '$server'"
          echo "offset is '$(ntpq_assoc_var "$assid" offset)', expected <= '$NTPD_SERVER_OFFSET_THRESHOLD'"
          echo "Waiting up to '$NTPD_HEALTH_TIMEOUT's for offset <= threshold value."
          sleep $sleep_interval
          time_waited=$((time_waited + sleep_interval))
        done
        if [[ $failed_server = true ]]; then continue; fi

        ntp_server_healthy_count=$((ntp_server_healthy_count + 1))
      fi
      # Break out once threshold is met
      if [[ $ntp_server_healthy_count -ge $NTPD_SERVER_COUNT_THRESHOLD ]]; then
        break
      fi
    done
    # Break out once threshold is met
    if [[ $ntp_server_healthy_count -ge $NTPD_SERVER_COUNT_THRESHOLD ]]; then
      break
    fi
  done

  if [[ $ntp_server_healthy_count -lt $NTPD_SERVER_COUNT_THRESHOLD ]]; then
    echo "Expected '$NTPD_SERVER_COUNT_THRESHOLD' healthy NTP servers, but got '$ntp_server_healthy_count'"
    die "Did not satisfy the minimum number of healthy NTP servers. See https://wiki.web.att.com/display/CCPdev/Network+Cloud+Troubleshooting#NetworkCloudTroubleshooting-TroubleshootingTimeSyncIssues"
  fi
fi