Linux - Server Load Monitor Script
This script monitors several system metrics on a given server and emails notifications when thresholds are exceeded.
It will send out something like this:
#!/bin/bash
############################################
#
# Author: Steve Ling 8/2/25
#
# Purpose: Monitor system metrics including:
# - Average Load
# - CPU Usage
# - Disk Space
# - Memory Usage
#
# Installation:
# 1. Copy to /opt/scripts/
# 2. Set permissions: chmod 755 /opt/scripts/loadmon.sh
# 3. Set ownership: chown remuser:kiwiplan /opt/scripts/loadmon.sh
# 4. Add cron job as root to run every 5 minutes:
# */5 * * * * /opt/scripts/loadmon.sh 2>&1 | logger -t loadmon
#
# Improvements:
# - Added error handling for commands and email sending
# - Improved logging with timestamps
# - Made thresholds configurable via environment variables
# - Added hostname to alerts for clarity
# - Replaced mutt with mail (more common)
# - Optimized command execution
# - Added input validation
#
############################################
# Exit on any error
set -e

#######################################
# Validate one threshold variable and abort the script when it is malformed.
# Thresholds are later used in arithmetic contexts, where a non-numeric
# string would be evaluated as an expression instead of being rejected.
# Arguments:
#   $1 - name of the variable holding the threshold
#   $2 - "int" (digits only) or "float" (digits with optional .digits)
# Outputs:
#   On failure, writes an error to stderr and, if logger is available,
#   to syslog under the "loadmon" tag.
# Returns:
#   0 when valid; exits the shell with status 1 otherwise.
#######################################
_loadmon_check_threshold() {
  local name="$1"
  local kind="$2"
  local value="${!name}"
  local pattern='^[0-9]+$'
  if [[ "$kind" == float ]]; then
    pattern='^[0-9]+([.][0-9]+)?$'
  fi
  if ! [[ "$value" =~ $pattern ]]; then
    local msg="Error: Invalid $name value: '$value'"
    printf '%s\n' "$msg" >&2
    if command -v logger >/dev/null 2>&1; then
      logger -t loadmon "$msg" || true
    fi
    exit 1
  fi
  if [[ "$kind" == int ]]; then
    # Force base 10 so a value such as 085 is not parsed as (invalid) octal
    # when it is later compared inside (( ... )).
    printf -v "$name" '%d' "$((10#$value))"
  fi
}

# Configuration (can be overridden via environment variables)
: "${LOAD_THRESHOLD:=10.00}" # Load average threshold
: "${DISK_THRESHOLD:=85}" # Disk usage threshold (%)
: "${CPU_THRESHOLD:=65}" # CPU usage threshold (%)
: "${MEM_THRESHOLD:=85}" # Memory usage threshold (%)
: "${RECIPIENTS:=steve.ling@sflservicesllc.com}" # Space-separated email addresses
# NOTE(review): bash normally pre-populates HOSTNAME with the host name, so
# the `hostname -s` fallback only runs when HOSTNAME is unset or empty; the
# value used in alerts may therefore be a fully qualified name.
: "${HOSTNAME:=$(hostname -s)}" # Short hostname for alerts
: "${LOG_FILE:=/var/log/loadmon.log}" # Log file location

# Reject malformed overrides before any threshold is compared.
_loadmon_check_threshold LOAD_THRESHOLD float
_loadmon_check_threshold DISK_THRESHOLD int
_loadmon_check_threshold CPU_THRESHOLD int
_loadmon_check_threshold MEM_THRESHOLD int
# Ensure required commands are available.
# A missing tool is reported on stderr and, only when logger itself can be
# found, to syslog as well; previously the report was always piped to
# logger, which fails when logger is the tool that is missing.
for cmd in awk df top free mail logger; do
  if ! command -v "$cmd" >/dev/null 2>&1; then
    missing_msg="Error: Required command '$cmd' not found"
    printf '%s\n' "$missing_msg" >&2
    if command -v logger >/dev/null 2>&1; then
      # Best effort: a syslog failure must not mask the real problem.
      logger -t loadmon "$missing_msg" || true
    fi
    exit 1
  fi
done
#######################################
# Function to log messages with timestamp.
# Writes one "YYYY-MM-DD HH:MM:SS - message" line to syslog (tag "loadmon")
# and appends the same line to the log file.
# Globals:
#   LOG_FILE (read) - path of the file the line is appended to
# Arguments:
#   $1 - message text
# Returns:
#   Always 0. Logging is best effort: a failing logger or an unwritable
#   LOG_FILE must not abort the monitoring run under `set -e`.
#######################################
log_message() {
  local message="$1"
  local stamp
  # Take the timestamp once so both destinations record the same instant.
  stamp=$(date '+%Y-%m-%d %H:%M:%S') || stamp="unknown-time"
  local line="$stamp - $message"
  printf '%s\n' "$line" | logger -t loadmon || true
  printf '%s\n' "$line" >> "$LOG_FILE" || true
  return 0
}
#######################################
# Function to send email alerts.
# Pipes the body to `mail` and records the outcome through log_message.
# Globals:
#   RECIPIENTS (read) - whitespace-separated list of email addresses
# Arguments:
#   $1 - subject line
#   $2 - message body, sent verbatim
# Returns:
#   The status of the log_message call that records the outcome; a failed
#   send is logged rather than propagated.
#######################################
send_alert() {
  local subject="$1"
  local body="$2"
  local -a recipients=()

  # Split on whitespace into an array; unlike an unquoted expansion this
  # does not apply pathname expansion to the addresses. read returns
  # non-zero at end of input with -d '', hence the || true.
  IFS=$' \t\n' read -r -d '' -a recipients <<< "$RECIPIENTS" || true

  if (( ${#recipients[@]} == 0 )); then
    log_message "Error: No recipients configured; alert not sent: $subject"
    return 0
  fi

  # printf '%s\n' sends the body unchanged; echo -e would rewrite any
  # backslash sequences present in the collected command output.
  if printf '%s\n' "$body" | mail -s "$subject" "${recipients[@]}" 2>/dev/null; then
    log_message "Alert sent: $subject"
  else
    log_message "Error: Failed to send alert: $subject"
  fi
}
# Collect system metrics.
# Each collector may fail without aborting the script (|| true): an empty or
# malformed value is caught and logged by the validation step below. Tools
# whose output is parsed run under LC_ALL=C so labels and decimal points
# are not localized.

# 1-minute load average (first field of /proc/loadavg).
load=$(awk '{print $1}' /proc/loadavg 2>/dev/null) || true

# Use% of the root filesystem. -P keeps each filesystem on a single line so
# the data row is always line 2, even with long device names.
disk_usage=$(LC_ALL=C df -P / 2>/dev/null \
  | awk 'NR==2 {sub(/%/, "", $5); print $5}') || true

# CPU busy % = 100 - idle, truncated to an integer. The idle figure is found
# by matching the number that precedes "id" rather than by field position,
# because top fuses columns when a value is three digits wide
# (e.g. "0.0 ni,100.0 id"), which shifts every following field.
cpu_usage=$(LC_ALL=C top -bn1 2>/dev/null | awk '
  !seen && /Cpu\(s\)/ {
    seen = 1
    if (match($0, /[0-9]+(\.[0-9]+)?[ ]*%?[ ]*id/)) {
      idle = substr($0, RSTART, RLENGTH) + 0
      printf "%d", 100 - idle
    }
  }') || true

# Used memory as a rounded percentage of total (columns 3 and 2 of "Mem:").
mem_usage=$(LC_ALL=C free 2>/dev/null \
  | awk '/^Mem:/ && $2 > 0 {printf "%.0f", $3 / $2 * 100}') || true

# Validate collected metrics
if ! [[ "$load" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  log_message "Error: Invalid load value: $load"
  exit 1
fi
if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
  log_message "Error: Invalid disk usage value: $disk_usage"
  exit 1
fi
if ! [[ "$cpu_usage" =~ ^[0-9]+$ ]]; then
  log_message "Error: Invalid CPU usage value: $cpu_usage"
  exit 1
fi
if ! [[ "$mem_usage" =~ ^[0-9]+$ ]]; then
  log_message "Error: Invalid memory usage value: $mem_usage"
  exit 1
fi

# Check thresholds and send alerts.
# The load average is fractional, so it is compared in awk (already a
# required tool) instead of bc, which is never checked for and would make
# this test silently false when absent.
if awk -v cur="$load" -v max="$LOAD_THRESHOLD" \
  'BEGIN { exit !(cur + 0 > max + 0) }'; then
  body=$(sar -q 2>/dev/null || echo "Error collecting sar data")
  send_alert "High load on $HOSTNAME - [ $load ]" "$body"
fi
if (( disk_usage > DISK_THRESHOLD )); then
  body=$(df -h / 2>/dev/null || echo "Error collecting df data")
  send_alert "High disk usage on $HOSTNAME - [ ${disk_usage}% ]" "$body"
fi
if (( cpu_usage > CPU_THRESHOLD )); then
  # head succeeds even when top produced nothing, so detect failure by an
  # empty result rather than by the pipeline's exit status.
  body=$(top -bn1 2>/dev/null | head -n 12) || true
  [[ -n "$body" ]] || body="Error collecting top data"
  send_alert "High CPU usage on $HOSTNAME - [ ${cpu_usage}% ]" "$body"
fi
if (( mem_usage > MEM_THRESHOLD )); then
  body=$(free -h 2>/dev/null || echo "Error collecting free data")
  send_alert "High memory usage on $HOSTNAME - [ ${mem_usage}% ]" "$body"
fi
log_message "Monitoring completed: Load=$load, Disk=${disk_usage}%, CPU=${cpu_usage}%, Mem=${mem_usage}%"
.