#!/bin/bash
################################################################################
# SYSTEM_HEALTH_CHECK.SH
# ----------------------
# This script checks the system health and reports any problems on stdout
#
# Checks performed: RAM usage, CPU usage, average CPU core temperature and
# failed systemd units. Limits (RAM_LIMIT, CPU_LIMIT, TEMP_LIMIT) are read
# from config/system_health_check next to this script. Every result is
# written to the log; only problems are additionally echoed.
#
# Author: Robin Meier - robin@meier.si
################################################################################

script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

config_file="${script_dir}/config/system_health_check"
logging_lib="${script_dir}/functions/logging.sh"

# Stop early with a clear message instead of running every check against
# empty limits or without the log/log_echo helpers.
for required_file in "$config_file" "$logging_lib"; do
  if [[ ! -r "$required_file" ]]; then
    printf 'system_health_check: cannot read %s\n' "$required_file" >&2
    exit 1
  fi
done

# Load configuration (allexport exports every variable the config defines)
set -o allexport
source "$config_file"
set +o allexport

# Import logging functionality
logfile="${script_dir}/log/system_health_check.log"
log_identifier="SYS"
source "$logging_lib"

problems=0

#######################################
# Compare a measured value against its limit and log the outcome.
# Globals:   problems (set to 1 when the limit is exceeded)
# Arguments: $1 - message prefix, e.g. "[RAM] usage"
#            $2 - measured value (may be fractional)
#            $3 - configured limit
#            $4 - unit appended to values in messages, e.g. "%"
# Outputs:   via log / log_echo
#######################################
check_limit() {
  local label=$1 value=$2 limit=$3 unit=$4

  # A missing value or limit would make bc fail with a syntax error and the
  # comparison meaningless, so record that the check was skipped instead.
  if [[ -z "$value" || -z "$limit" ]]; then
    log "${label} could not be checked (value: '${value}', limit: '${limit}')"
    return 0
  fi

  # bc is used because the values may be fractional; it prints 1 when the
  # relation holds and 0 otherwise.
  if [[ "$(echo "$value > $limit" | bc -l)" == "1" ]]; then
    log_echo "${label} is above limit of ${limit}${unit}!"
    log "${label} is ${value}${unit}! (Limit: ${limit})"
    problems=1
  else
    log "${label} is ${value}${unit}"
  fi
}

log "Starting System Health Check"

# RAM usage percentage (used / total from the "Mem:" row).
# LC_ALL=C keeps the row label and number format independent of the locale.
ram=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { printf "%.2f", $3 / $2 * 100 }')
check_limit "[RAM] usage" "$ram" "$RAM_LIMIT" "%"

# CPU usage percentage.
# The label up to the colon is stripped before reading fields, because top can
# print the first value directly after the colon (e.g. "%Cpu(s):100.0 us"),
# which would otherwise shift the field positions. Fields 1 and 3 after the
# strip are the same two values the summary line exposes as fields 2 and 4.
cpu=$(LC_ALL=C top -bn1 | awk '/Cpu\(s\)/ { sub(/^[^:]*:/, ""); print $1 + $3 }')
check_limit "[CPU] load" "$cpu" "$CPU_LIMIT" "%"

# Load?
# TODO: Maybe check `load`

# Temperature: average over all "Core N:" rows reported by sensors.
# All non-digits are removed from the reading (e.g. "+45.0°C" -> 450), hence
# the division by 10; this relies on readings having exactly one decimal.
# Nothing is printed when no core rows exist, avoiding a division by zero.
avg_cpu_temp=$(sensors | awk '
  /^Core / { ++r; gsub(/[^[:digit:]]+/, "", $3); s += $3 }
  END      { if (r > 0) print s / (10 * r) }
')
check_limit "[TEMP]" "$avg_cpu_temp" "$TEMP_LIMIT" "°C"

# Failed Services: read the count from the "Failed: N units" status line.
failed_services=$(systemctl status | grep -i 'Failed:\s.*\sunits' | awk '{print $2}')
# String comparison on purpose: an empty or unexpected value falls through to
# the error branch instead of raising a test syntax error.
if [[ "$failed_services" == "0" ]]; then
  log "[SRV] No failed services"
else
  log_echo "[SRV] [ERROR] Failed Services:"
  /bin/systemctl --failed \
    | ts "[%Y-%m-%d %H:%M:%S] $log_identifier [SRV]" \
    | tee -a "$logfile"
  log_echo "[SRV] You should run \"systemctl status/start \" to find out more or start the unit."
  problems=1
fi

# Finish
if (( problems == 0 )); then
  log "System Health Check Successful"
else
  log "System Health Check Found Problems"
fi