#!/bin/bash
################################################################################
# SYSTEM_HEALTH_CHECK.SH
# ----------------------
# This script checks the system health and reports any problems on stdout.
# Every result is appended to the log file; only problems are also echoed.
#
# Configuration: RAM_LIMIT, CPU_LIMIT and TEMP_LIMIT are read from
# /root/scripts/.system_health_check_env.
# External tools used: ts, bc, free, top, sensors, systemctl.
#
# Author: Robin Meier - robin@meier.si
################################################################################

logfile=/root/logs/system_health_check.log
log_identifier="[SYS]"
env_file=/root/scripts/.system_health_check_env

#######################################
# Append a timestamped message to the log file only.
# Globals:   logfile (read), log_identifier (read)
# Arguments: the message words; they are joined with single spaces
#######################################
log() {
  # "$*" is quoted so that tags such as [RAM] are never glob-expanded against
  # files in the current directory. %b keeps the backslash-escape handling
  # that `echo -e` provided.
  printf '%b\n' "$*" \
    | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" >> "$logfile"
}

#######################################
# Print a timestamped message on stdout and append it to the log file.
# Globals:   logfile (read), log_identifier (read)
# Arguments: the message words; they are joined with single spaces
#######################################
log_echo() {
  printf '%b\n' "$*" \
    | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" \
    | tee -a "$logfile"
}

#######################################
# Tell whether a measured value is strictly greater than its limit.
# Arguments: $1 - measured value (may be fractional)
#            $2 - limit
# Returns:   0 if $1 > $2; 1 otherwise, including when the measurement is
#            empty because the tool that produces it gave no output
#######################################
exceeds_limit() {
  local value=$1
  local limit=$2
  local verdict

  # An empty operand would only make bc print a syntax error.
  [[ -n "$value" ]] || return 1

  # bc is used because the values are fractional; it prints 1 for true.
  verdict=$(bc -l <<<"$value > $limit") || return 1
  [[ "$verdict" == "1" ]]
}

# Load the limits. Without them every comparison below would quietly report
# "healthy", so a broken configuration is reported and the run is aborted.
if [[ ! -r "$env_file" ]]; then
  log_echo "[CFG] [ERROR] Cannot read $env_file"
  exit 1
fi

set -o allexport
# shellcheck source=/dev/null
source "$env_file"
set +o allexport

for required_var in RAM_LIMIT CPU_LIMIT TEMP_LIMIT; do
  if [[ -z "${!required_var:-}" ]]; then
    log_echo "[CFG] [ERROR] $required_var is not set in $env_file"
    exit 1
  fi
done

problems=0

log "Starting System Health Check"

# RAM usage percentage (third column over second column of the "Mem:" line)
ram=$(free | awk '/^Mem:/ { printf "%.2f", $3 / $2 * 100 }')
if exceeds_limit "$ram" "$RAM_LIMIT"; then
  log_echo "[RAM] usage is ${ram}%! (Limit: $RAM_LIMIT)"
  problems=1
else
  log "[RAM] usage is ${ram}%"
fi

# CPU usage percentage (sum of the 2nd and 4th fields of top's "Cpu(s)" line)
cpu=$(top -bn1 | awk '/Cpu\(s\)/ { print $2 + $4 }')
if exceeds_limit "$cpu" "$CPU_LIMIT"; then
  log_echo "[CPU] load is ${cpu}%! (Limit: $CPU_LIMIT)"
  problems=1
else
  log "[CPU] load is ${cpu}%"
fi

# Load?
# TODO: Maybe check `load`

# Temperature: average over all "Core N" lines printed by sensors. Stripping
# every non-digit turns e.g. "+45.0°C" into 450, hence the division by 10.
avg_cpu_temp=$(sensors | awk '
  /^Core / { ++r; gsub(/[^[:digit:]]+/, "", $3); s += $3 }
  END      { if (r) print s / (10 * r) }  # no Core lines: print nothing
')
if exceeds_limit "$avg_cpu_temp" "$TEMP_LIMIT"; then
  log_echo "[TEMP] is ${avg_cpu_temp}°C! (Limit: $TEMP_LIMIT)"
  problems=1
else
  log "[TEMP] is ${avg_cpu_temp}°C"
fi

# Failed Services: second field of the "Failed: N units" line of
# `systemctl status`.
failed_services=$(systemctl status \
  | grep -i 'Failed:\s.*\sunits' \
  | awk '{print $2}')
# Anything other than a plain "0" (including an empty result when the line
# could not be parsed) is reported, so a parsing failure is never mistaken
# for a healthy system.
if [[ "$failed_services" == "0" ]]; then
  log "[SRV] No failed services"
else
  log_echo "[SRV] [ERROR] Failed Services:"
  /bin/systemctl --failed \
    | ts "[%Y-%m-%d %H:%M:%S] $log_identifier [SRV]" \
    | tee -a "$logfile"
  log_echo "[SRV] You should run \"systemctl status/start \" to find out more or start the unit."
  problems=1
fi

# Finish
if [[ "$problems" -eq 0 ]]; then
  log "System Health Check Successful"
else
  log_echo "System Health Check Found Problems"
fi