admin-scripts/system_health_check.sh

77 lines
2.2 KiB
Bash
Executable File

#!/bin/bash
################################################################################
# SYSTEM_HEALTH_CHECK.SH
# ----------------------
# This script checks the system health and reports any problems on stdout
#
# Author: Robin Meier - robin@meier.si
################################################################################
# Resolve the directory that contains this script so the config, log and
# function paths below do not depend on the caller's working directory.
script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

# Load configuration.
# allexport is switched on only while the config file is sourced, so every
# variable assigned there is exported; it is switched off again right after.
# The path is quoted so a checkout directory containing blanks still works.
set -o allexport
source "${script_dir}/config/system_health_check"
set +o allexport

# Import logging functionality.
# logfile and log_identifier are assigned before logging.sh is sourced;
# presumably that file reads them -- confirm against functions/logging.sh.
logfile="${script_dir}/log/system_health_check.log"
log_identifier="SYS"
source "${script_dir}/functions/logging.sh"

# Overall result flag: stays 0 unless a later check sets it to 1.
problems=0

log "Starting System Health Check"
# RAM usage percentage
#######################################
# Compare the used share of RAM (used / total as printed by `free`, in
# percent) with RAM_LIMIT.
# Globals:
#   RAM_LIMIT  (read)  limit in percent
#   problems   (write) set to 1 when the limit is exceeded or when the
#                      comparison cannot be made
# Outputs:
#   Findings go through log_echo, everything else through log (both are
#   defined outside this section).
#######################################
check_ram() {
  local -r number_re='^[0-9]+([.][0-9]+)?$'
  local ram

  # LC_ALL=C pins the "Mem:" row label and the decimal point. The $2 > 0
  # guard avoids a division by zero if no total is reported.
  ram=$(LC_ALL=C free \
    | LC_ALL=C awk '/^Mem:/ && $2 > 0 { printf "%.2f", $3 / $2 * 100; exit }')

  if [[ ! ${ram} =~ ${number_re} || ! ${RAM_LIMIT:-} =~ ${number_re} ]]; then
    # Without two plain numbers the comparison is meaningless. Report that
    # instead of silently logging the check as passed.
    log_echo "[RAM] cannot compare usage '${ram}' with limit '${RAM_LIMIT:-}'!"
    problems=1
  elif LC_ALL=C awk -v usage="${ram}" -v limit="${RAM_LIMIT}" \
    'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    # awk does the floating-point comparison, so no extra tool is required.
    log_echo "[RAM] usage is above limit of ${RAM_LIMIT}%!"
    log "[RAM] usage is ${ram}%! (Limit: $RAM_LIMIT)"
    problems=1
  else
    log "[RAM] usage is ${ram}%"
  fi
}
check_ram
# CPU usage percentage
#######################################
# Compare the sum of the 2nd and 4th field of top's "Cpu(s)" summary line
# (user + system time with the current top layout) with CPU_LIMIT.
# Globals:
#   CPU_LIMIT  (read)  limit in percent
#   problems   (write) set to 1 when the limit is exceeded or when the
#                      comparison cannot be made
# Outputs:
#   Findings go through log_echo, everything else through log (both are
#   defined outside this section).
#######################################
check_cpu() {
  local -r number_re='^[0-9]+([.][0-9]+)?$'
  local cpu

  # top right-aligns every value in a fixed-width column, so a reading of
  # 100.0 ends up glued to the preceding ':' or ',' ("%Cpu(s):100.0 us").
  # Turning both separators into blanks keeps the values in fields 2 and 4.
  # LC_ALL=C guarantees '.' as decimal point, so ',' is only a separator.
  cpu=$(LC_ALL=C top -bn1 \
    | LC_ALL=C awk '/Cpu\(s\)/ { gsub(/[,:]/, " "); print $2 + $4; exit }')

  if [[ ! ${cpu} =~ ${number_re} || ! ${CPU_LIMIT:-} =~ ${number_re} ]]; then
    # Without two plain numbers the comparison is meaningless. Report that
    # instead of silently logging the check as passed.
    log_echo "[CPU] cannot compare load '${cpu}' with limit '${CPU_LIMIT:-}'!"
    problems=1
  elif LC_ALL=C awk -v load="${cpu}" -v limit="${CPU_LIMIT}" \
    'BEGIN { exit !(load + 0 > limit + 0) }'; then
    log_echo "[CPU] load is above limit of ${CPU_LIMIT}%!"
    log "[CPU] load is ${cpu}%! (Limit: $CPU_LIMIT)"
    problems=1
  else
    log "[CPU] load is ${cpu}%"
  fi
}
check_cpu
# Load average
# TODO: Maybe also check the system load average (e.g. from `uptime` or /proc/loadavg)
# Temperature
#######################################
# Compare the average of all "Core ..." readings printed by `sensors` with
# TEMP_LIMIT.
# Globals:
#   TEMP_LIMIT (read)  limit in degrees Celsius
#   problems   (write) set to 1 when the limit is exceeded or when no
#                      reading / no valid limit is available
# Outputs:
#   Findings go through log_echo, everything else through log (both are
#   defined outside this section).
#######################################
check_temp() {
  local -r number_re='^[0-9]+([.][0-9]+)?$'
  local avg_cpu_temp

  # Field 3 of each "Core" line is reduced to its digits, which assumes a
  # reading with exactly one decimal place ("+45.0" -> 450 tenths); hence
  # the division by 10 * cores. Nothing is printed when no "Core" line was
  # seen, so a missing sensor cannot cause a division by zero.
  avg_cpu_temp=$(sensors | LC_ALL=C awk '
    /^Core / { ++cores; gsub(/[^0-9]+/, "", $3); tenths += $3 }
    END      { if (cores > 0) print tenths / (10 * cores) }
  ')

  if [[ ! ${avg_cpu_temp} =~ ${number_re} || ! ${TEMP_LIMIT:-} =~ ${number_re} ]]; then
    # No usable reading or limit: report it rather than logging an empty
    # temperature as if the check had passed.
    log_echo "[TEMP] cannot compare temperature '${avg_cpu_temp}' with limit '${TEMP_LIMIT:-}'!"
    problems=1
  elif LC_ALL=C awk -v temp="${avg_cpu_temp}" -v limit="${TEMP_LIMIT}" \
    'BEGIN { exit !(temp + 0 > limit + 0) }'; then
    log_echo "[TEMP] is above limit of ${TEMP_LIMIT}°C!"
    log "[TEMP] is ${avg_cpu_temp}°C! (Limit: $TEMP_LIMIT)"
    problems=1
  else
    log "[TEMP] is ${avg_cpu_temp}°C"
  fi
}
check_temp
# Failed Services
#######################################
# Read the "Failed: N units" line of `systemctl status` and, when N is not
# zero, append the `systemctl --failed` listing to the log and stdout.
# Globals:
#   log_identifier (read)  tag placed in front of every listed line
#   logfile        (read)  file the listing is appended to
#   problems       (write) set to 1 when units have failed or when the
#                          count cannot be determined
# Outputs:
#   Findings go through log_echo / tee, everything else through log
#   (log and log_echo are defined outside this section).
#######################################
check_failed_services() {
  local failed_services

  failed_services=$(systemctl status \
    | awk 'tolower($1) == "failed:" && tolower($3) == "units" { print $2; exit }')

  if [[ ! ${failed_services} =~ ^[0-9]+$ ]]; then
    # An empty or non-numeric count used to fall through to the "Failed
    # Services" branch and print an empty list; name the real cause instead.
    log_echo "[SRV] [ERROR] Could not determine the number of failed services"
    problems=1
  elif ((10#${failed_services} == 0)); then
    log "[SRV] No failed services"
  else
    log_echo "[SRV] [ERROR] Failed Services:"
    # systemctl is resolved via PATH here as well, matching the call above.
    # ts prefixes each line with a timestamp and tag; tee shows the listing
    # and appends it to the log file.
    systemctl --failed \
      | ts "[%Y-%m-%d %H:%M:%S] $log_identifier [SRV]" \
      | tee -a "$logfile"
    log_echo "[SRV] You should run \"systemctl status/start <servicename>\" to find out more or start the unit."
    problems=1
  fi
}
check_failed_services
# Finish: write the overall verdict to the log.
# Start from the "found problems" verdict and switch to "successful" only
# when the problems flag compares equal to 0.
verdict="System Health Check Found Problems"
if [ "${problems}" -eq 0 ]; then
  verdict="System Health Check Successful"
fi
log "${verdict}"