commit 5826d924d63218d0536f9e29c72d60cade115da3 Author: Robi Meier Date: Fri Mar 15 22:07:08 2024 +0100 Initial commit diff --git a/.dyndns_env.EXAMPLE b/.dyndns_env.EXAMPLE new file mode 100644 index 0000000..1876e69 --- /dev/null +++ b/.dyndns_env.EXAMPLE @@ -0,0 +1,4 @@ +USERNAME=username +PASSWORD=password +MAIN_DOMAIN="ip.mydomain.com" +ADDITIONAL_DOMAINS="mydomain.com otherdomain.com www.otherdomain.com" diff --git a/.file_monitor_env.EXAMPLE b/.file_monitor_env.EXAMPLE new file mode 100644 index 0000000..581ebae --- /dev/null +++ b/.file_monitor_env.EXAMPLE @@ -0,0 +1 @@ +FILES="/root/testfile.txt" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..90dfe7d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.*_env + +storage/ + +# This file is unused atm +docker_health_check.sh diff --git a/.monitoring_env.EXAMPLE b/.monitoring_env.EXAMPLE new file mode 100644 index 0000000..d374fbb --- /dev/null +++ b/.monitoring_env.EXAMPLE @@ -0,0 +1,2 @@ +SSH_MONITORING="192.168.1.1:22 example.com:2822" +HTTP_MONITORING="http://wimi.meier.si https://google.com https://username:password@some-httpauth-protected.site" diff --git a/.post_startup_env.EXAMPLE b/.post_startup_env.EXAMPLE new file mode 120000 index 0000000..114b036 --- /dev/null +++ b/.post_startup_env.EXAMPLE @@ -0,0 +1 @@ +/home/robin/.post_startup_env \ No newline at end of file diff --git a/.system_health_check_env.EXAMPLE b/.system_health_check_env.EXAMPLE new file mode 120000 index 0000000..aa08efe --- /dev/null +++ b/.system_health_check_env.EXAMPLE @@ -0,0 +1 @@ +.system_health_check_env \ No newline at end of file diff --git a/.telegram_notification_env.EXAMPLE b/.telegram_notification_env.EXAMPLE new file mode 100644 index 0000000..0260f93 --- /dev/null +++ b/.telegram_notification_env.EXAMPLE @@ -0,0 +1,2 @@ +BOT_TOKEN=987654321:ABD-jTsfGp23cptjUsSv8md0sVjnFeCd8g +CHAT_ID=12345678 diff --git a/.zfs_health_check_env.EXAMPLE b/.zfs_health_check_env.EXAMPLE new file mode 120000 index 
0000000..79d0d31 --- /dev/null +++ b/.zfs_health_check_env.EXAMPLE @@ -0,0 +1 @@ +.zfs_health_check_env \ No newline at end of file diff --git a/dyndns.sh b/dyndns.sh new file mode 100755 index 0000000..485399b --- /dev/null +++ b/dyndns.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +logfile=/root/logs/dyndns.log +log_identifier="[DNS]" +log() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" >> $logfile +} +log_echo() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile +} + +set -o allexport +source /root/scripts/.dyndns_env +set +o allexport + +url="https://${USERNAME}:${PASSWORD}@infomaniak.com/nic/update?hostname=" + +log "Updating DynDNS for ${MAIN_DOMAIN}" + +response=$(curl -s -f "${url}${MAIN_DOMAIN}") +if [[ $? -ne 0 ]]; then + log_echo "[ERROR] ${MAIN_DOMAIN} DynDNS Request Failed!" + exit 1 +fi + +log "Response: ${response}" + +if [[ "$response" =~ ^nochg ]]; then + # IP has not changed + # log_echo "IP has not changed, is still $(echo $response | awk '{print $2}')" + exit 0 + +elif [[ "$response" =~ ^good ]]; then + # IP has changed + log_echo "IP HAS CHANGED TO $(echo $response | awk '{print $2}')" + log_echo "${MAIN_DOMAIN} was updated successfully" + + domains_error=0 + for domain in $ADDITIONAL_DOMAINS + do + log "Updating DynDNS for ${domain}" + additional_response=$(curl -s -f "${url}${domain}") + if [[ $? -ne 0 ]]; then + log_echo "[ERROR] ${domain} DynDNS Request Failed!" + exit 1 + fi + + log "${domain} response: ${additional_response}" + + if [[ "$additional_response" =~ ^good ]]; then + # Change succeeded + log_echo "${domain} was updated successfully" + elif [[ "$additional_response" =~ ^nochg ]]; then + log_echo "${domain} did not change" + else + log_echo "[ERROR] ${domain} DynDNS request response does not match expectations!" + domains_error=1 + fi + sleep 1 + done + + if [ $domains_error -eq 1 ]; then + exit 2 + fi +else + log_echo "[ERROR] ${MAIN_DOMAIN} DynDNS request response does not match expectations!" 
+ exit 2 +fi diff --git a/file_monitor.sh b/file_monitor.sh new file mode 100755 index 0000000..d4c7c6f --- /dev/null +++ b/file_monitor.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +################################################################################ +# FILE_MONITOR.SH +# --------------- +# This script monitors files for changes and reports any problems on stdout +# +# Author: Robin Meier - robin@meier.si +################################################################################ + +logfile=/root/logs/file_monitor.log +log_identifier="[FILE]" +log() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" >> $logfile +} +log_echo() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile +} + +set -o allexport +source /root/scripts/.file_monitor_env +set +o allexport + +mkdir -p /root/scripts/storage/file_monitor + +for file in $FILES
do +# Touch storage file if not existing + if [ ! -f /root/scripts/storage/file_monitor/${file//\//_} ]; then + touch /root/scripts/storage/file_monitor/${file//\//_} + fi + + if [ "$file" -nt "/root/scripts/storage/file_monitor/${file//\//_}" ]; then + log_echo "[CHANGE] $file" + touch /root/scripts/storage/file_monitor/${file//\//_} + fi +done diff --git a/monitoring.sh b/monitoring.sh new file mode 100755 index 0000000..e8cd89e --- /dev/null +++ b/monitoring.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +logfile=/root/logs/monitoring.log +log_identifier="[MON]" +log() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" >> $logfile +} +log_echo() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile +} + +set -o allexport +source /root/scripts/.monitoring_env +set +o allexport + + +problems=0 + +log "Starting monitoring run" + +for ssh_host in $SSH_MONITORING +do + if [[ $(nc -w 2 ${ssh_host//:/ } <<< "\0" ) =~ "OpenSSH" ]] ; then + log "[SSH] [OK] ${ssh_host} is reachable" + else + # TODO: Rate limit fail messages, also add is back up message + log_echo "[SSH] [FAIL] ${ssh_host} 
not reachable" + problems=1 + fi +done + +# TODO: HTTP Status Code 200 Monitoring +for http_host in $HTTP_MONITORING +do + status_code=$(curl --write-out %{http_code} --silent --output /dev/null $http_host) + if [[ "$status_code" -eq 200 ]] ; then + log "[WEB] [OK] ${http_host}" + else + # TODO: Rate limit fail messages, also add is back up message + log_echo "[WEB] [FAIL] ${http_host} status code is ${status_code}" + problems=1 + fi +done + + +if [[ "$problems" -eq "0" ]]; then + log "Monitoring Run Successful" +else + log_echo "Monitoring Run Failed" +fi diff --git a/post_startup.sh b/post_startup.sh new file mode 120000 index 0000000..1ff690e --- /dev/null +++ b/post_startup.sh @@ -0,0 +1 @@ +/home/robin/post_startup.sh \ No newline at end of file diff --git a/system_health_check.sh b/system_health_check.sh new file mode 100755 index 0000000..d11bd79 --- /dev/null +++ b/system_health_check.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +################################################################################ +# SYSTEM_HEALTH_CHECK.SH +# ---------------------- +# This script checks the system health and reports any problems on stdout +# +# Author: Robin Meier - robin@meier.si +################################################################################ + +logfile=/root/logs/system_health_check.log +log_identifier="[SYS]" +log() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" >> $logfile +} +log_echo() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile +} + +set -o allexport +source /root/scripts/.system_health_check_env +set +o allexport + + +problems=0 + +log "Starting System Health Check" + +# RAM usage percentage +ram=$(free | awk '/Mem/{printf("%.2f"), $3/$2*100}') +if [ $(echo "$ram > $RAM_LIMIT" | bc -l) -eq 1 ]; then + log_echo "[RAM] usage is ${ram}%! 
(Limit: $RAM_LIMIT)" + problems=1 +else + log "[RAM] usage is ${ram}%" +fi + +# CPU usage percentage +cpu=$(top -bn1 | grep "Cpu(s)" | awk '{print $2 + $4}') +if [ $(echo "$cpu > $CPU_LIMIT" | bc -l) -eq 1 ]; then + log_echo "[CPU] load is ${cpu}%! (Limit: $CPU_LIMIT)" + problems=1 +else + log "[CPU] load is ${cpu}%" +fi + +# Load? +# TODO: Maybe check `load` + +# Temperature +avg_cpu_temp=$(sensors | awk '/^Core /{++r; gsub(/[^[:digit:]]+/, "", $3); s+=$3} END{print s/(10*r)}') +if [ $(echo "$avg_cpu_temp > $TEMP_LIMIT" | bc -l) -eq 1 ]; then + log_echo "[TEMP] is ${avg_cpu_temp}°C! (Limit: $TEMP_LIMIT)" + problems=1 +else + log "[TEMP] is ${avg_cpu_temp}°C" +fi + +# Failed Services +failed_services=$(systemctl status | grep -i 'Failed:\s.*\sunits' | awk '{print $2}') +if [ "${failed_services:-0}" -eq 0 ]; then + log "[SRV] No failed services" +else + log_echo "[SRV] [ERROR] Failed Services:" + /bin/systemctl --failed | ts "[%Y-%m-%d %H:%M:%S] $log_identifier [SRV]" | tee -a $logfile + log_echo "[SRV] You should run \"systemctl status/start \" to find out more or start the unit." 
+ problems=1 +fi + +# Finish +if [ ${problems} -eq 0 ]; then + log "System Health Check Successful" +else + log_echo "System Health Check Found Problems" +fi diff --git a/telegram_notification.sh b/telegram_notification.sh new file mode 100755 index 0000000..b7b100a --- /dev/null +++ b/telegram_notification.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +################################################################################ +# TELEGRAM_NOTIFICATION.SH +# ------------------------ +# This script takes input via stdin or parameters, removes timestamps from each +# line and replaces newlines with telegram compatible ones and then sends the +# message to a chat +# +# Author: Robin Meier - robin@meier.si +################################################################################ + +set -o allexport +source /root/scripts/.telegram_notification_env +set +o allexport + +BOT_API_URL=https://api.telegram.org/bot${BOT_TOKEN} + +# Get input from standard input or via first parameter +if [[ $# -eq 0 ]]; then + MESSAGE=$(timeout 30 cat) +elif [[ $# -eq 1 ]]; then + MESSAGE=$1 +elif [[ $# -eq 2 ]]; then + CHAT_ID=$1 + MESSAGE=$2 +else + echo "[ERROR] Too many arguments!" +fi + +# Exit if input is empty +if [[ -z "${MESSAGE}" ]]; then + exit 0 +fi + +# Strip timestamps from message +if [ "${MESSAGE:0:12}" == "$(echo '' | ts "[%Y-%m-%d")" ]; then + MESSAGE=$(echo -e "$MESSAGE" | cut -c 23-) +fi + +# Replace newlines in message for telegram +TG_MESSAGE=${MESSAGE//$'\n'/\%0A} + +# Send telegram to chat +resp=$(curl -s -f -X POST ${BOT_API_URL}/sendMessage -d chat_id=$CHAT_ID -d text="⚠️ *$(hostname | tr . ' ')* ⚠️%0A\`\`\`%0A${TG_MESSAGE}%0A\`\`\`" -d parse_mode=markdown) + +# Check if request succeeded +if [[ $? -ne 0 ]]; then + echo "[ERROR] Telegram request failed!" 
+else + if [ "${resp:1:9}" == "\"ok\":true" ]; then + : # echo "Sent Telegram: \n${MESSAGE//$'\n'/\n}" + else + echo "[ERROR] Telegram sending did not succeed: $resp" + echo "MESSAGE: \n${MESSAGE//$'\n'/\n}" + fi +fi diff --git a/zfs_health_check.sh b/zfs_health_check.sh new file mode 100755 index 0000000..6b5f676 --- /dev/null +++ b/zfs_health_check.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +################################################################################ +# ZFS_HEALTH_CHECK.SH +# ------------------- +# This script checks the ZFS health and reports any problems on stdout +# +# Inspired by https://gist.github.com/petervanderdoes/bd6660302404ed5b094d +# +# Author: Robin Meier - robin@meier.si +################################################################################ + +logfile=/root/logs/zfs_health_check.log +log_identifier="[ZFS]" +log() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" >> $logfile +} +log_echo() { + echo -e $@ | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile +} + +problems=0 + +set -o allexport +source /root/scripts/.zfs_health_check_env +set +o allexport + +log "Starting ZFS Health Check" + +# Pool Status +zpool_status=$(/sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)') +if [ "${zpool_status}" ]; then + log_echo "[ERROR] !!! BAD ZFS HEALTH !!!" + /sbin/zpool status | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile + problems=1 +fi + +# Capacity +maxCapacity=50 +if [ ${problems} -eq 0 ]; then + capacity=$(/sbin/zpool list -H -o capacity) + for line in ${capacity//%/} + do + if [ $line -ge $maxCapacity ]; then + problems=1 + fi + done + if [ ${problems} -eq 1 ]; then + log_echo "[ERROR] !!! BAD ZFS CAPACITY !!!" 
+ /sbin/zpool list -o name,cap,free | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile + fi +fi + +# Errors - Check the columns for READ, WRITE and CKSUM (checksum) drive errors +# on all volumes and all drives using "zpool status". If any non-zero errors +# are reported an email will be sent out. You should then look to replace the +# faulty drive and run "zpool scrub" on the affected volume after resilvering. +if [ ${problems} -eq 0 ]; then + errors=$(/sbin/zpool status | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000) + if [ "${errors}" ]; then + log_echo "[ERROR] !!! ZFS ERRORS FOUND !!!" + /sbin/zpool status | ts "[%Y-%m-%d %H:%M:%S] $log_identifier" | tee -a $logfile + log_echo "[ERROR] You shoud replace the faulty drive and run \"zpool scrub\" after resilvering" + problems=1 + fi +fi + +# Scrub Expired - Check if all volumes have been scrubbed in at least the last +# 8 days. The general guide is to scrub volumes on desktop quality drives once +# a week and volumes on enterprise class drives once a month. You can always +# use cron to schedule "zpool scrub" in off hours. We scrub our volumes every +# Sunday morning for example. +# +# Scrubbing traverses all the data in the pool once and verifies all blocks can +# be read. Scrubbing proceeds as fast as the devices allows, though the +# priority of any I/O remains below that of normal calls. This operation might +# negatively impact performance, but the file system will remain usable and +# responsive while scrubbing occurs. To initiate an explicit scrub, use the +# "zpool scrub" command. +# +# The scrubExpire variable is in seconds. So for 8 days we calculate 8 days +# times 24 hours times 3600 seconds to equal 691200 seconds. 
+# scrubExpire=691200 +scrubExpire=1382400 +if [ ${problems} -eq 0 ]; then + currentDate=$(date +%s) + zfsVolumes=$(/sbin/zpool list -H -o name) + for volume in ${zfsVolumes} + do + if [ $(/sbin/zpool status $volume | egrep -c "none requested") -ge 1 ]; then + log_echo "ERROR: You need to run \"zpool scrub $volume\" before this script can monitor the scrub expiration time." + break + fi + if [ $(/sbin/zpool status $volume | egrep -c "scrub in progress|resilver") -ge 1 ]; then + break + fi + scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}') + scrubDate=$(date -d "$scrubRawDate" +%s) + if [ $(($currentDate - $scrubDate)) -ge $scrubExpire ]; then + problems=1 + log_echo "[ERROR] Pool: $volume needs scrub!" + fi + done +fi + +# Unmounted datasets +unmounted=$(/sbin/zfs list -o mounted | grep no) +if [ "${unmounted}" ]; then + log "[WARN] THERE ARE UNMOUNTED DATASETS" +fi + +# Warnable unmounted datasets +for dataset in $MONITORED_DATASETS +do + unmounted=$(/sbin/zfs list $dataset -o mounted | grep no) + if [ "${unmounted}" ]; then + log_echo "[MON] Monitored dataset $dataset is not mounted" + problems=1 + fi +done + +# Finish +if [ ${problems} -eq 0 ]; then + log "ZFS Health Check Successful" +else + log_echo "ZFS Health Check Found Problems" +fi