Skip to main content

Linux - Server Load Monitor Script

This script monitors several metrics on a given server (load average, CPU, disk space and memory) and sends email notifications when thresholds are exceeded.

It will send out something like this:

image.png

#!/bin/bash
############################################
#
# Author: Steve Ling 8/2/25
#
# Purpose: Monitor system metrics including:
# - Average Load
# - CPU Usage
# - Disk Space
# - Memory Usage
#
# Installation:
# 1. Copy to /opt/scripts/
# 2. Set permissions: chmod 755 /opt/scripts/loadmon.sh
# 3. Set ownership: chown remuser:kiwiplan /opt/scripts/loadmon.sh
# 4. Add cron job as root to run every 5 minutes:
#    */5 * * * * /opt/scripts/loadmon.sh 2>&1 | logger -t loadmon
#
# Improvements:
# - Added error handling for commands and email sending
# - Improved logging with timestamps
# - Made thresholds configurable via environment variables
# - Added hostname to alerts for clarity
# - Replaced mutt with mail (more common)
# - Optimized command execution
# - Added input validation
#
############################################

# Abort the run as soon as an unguarded command returns a non-zero status.
set -e

# Tunables. A non-empty value inherited from the environment wins; otherwise
# the default on the right-hand side is used.
LOAD_THRESHOLD="${LOAD_THRESHOLD:-10.00}"                   # load average limit
DISK_THRESHOLD="${DISK_THRESHOLD:-85}"                      # disk usage limit (%)
CPU_THRESHOLD="${CPU_THRESHOLD:-65}"                        # CPU usage limit (%)
MEM_THRESHOLD="${MEM_THRESHOLD:-85}"                        # memory usage limit (%)
RECIPIENTS="${RECIPIENTS:-steve.ling@sflservicesllc.com}"   # space-separated addresses
# NOTE(review): Bash normally presets HOSTNAME itself, so the `hostname -s`
# fallback only runs when HOSTNAME is empty - confirm whether the short name
# is really what ends up in alert subjects.
HOSTNAME="${HOSTNAME:-$(hostname -s || true)}"              # host name used in alerts
LOG_FILE="${LOG_FILE:-/var/log/loadmon.log}"                # file that log entries are appended to

# Ensure required commands are available.
# Every missing command is reported (not just the first one). The message is
# written to stderr unconditionally, because syslog cannot be used when
# `logger` itself is the command that is missing; it is additionally sent to
# syslog when `logger` exists.
missing_cmds=()
for cmd in awk df top free mail logger; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
        missing_cmds+=("$cmd")
    fi
done

if (( ${#missing_cmds[@]} > 0 )); then
    for cmd in "${missing_cmds[@]}"; do
        printf '%s\n' "Error: Required command '$cmd' not found" >&2
        if command -v logger >/dev/null 2>&1; then
            # Best effort: a syslog failure must not hide the exit below.
            printf '%s\n' "Error: Required command '$cmd' not found" \
                | logger -t loadmon || true
        fi
    done
    exit 1
fi

#######################################
# Log a message, prefixed with a timestamp, to syslog (tag "loadmon") and
# to the file named by LOG_FILE.
# Globals:   LOG_FILE (read) - path of the log file that is appended to
# Arguments: $1 - message text
# Outputs:   writes nothing to stdout itself, so it is safe to call inside
#            a command substitution
# Returns:   always 0. Logging is best-effort: under `set -e` a failing
#            logger or an unwritable log file must not abort the monitor
#            halfway through its checks.
#######################################
log_message() {
    local message="$1"
    local stamp entry

    # Build the entry once so syslog and the log file carry the same timestamp.
    stamp=$(date '+%Y-%m-%d %H:%M:%S') || stamp="unknown-time"
    entry="$stamp - $message"

    printf '%s\n' "$entry" | logger -t loadmon || true
    # The group lets 2>/dev/null also silence the shell's own
    # "cannot open file" diagnostic when the redirection itself fails.
    { printf '%s\n' "$entry" >> "$LOG_FILE"; } 2>/dev/null || true
    return 0
}

#######################################
# Email an alert to every address in RECIPIENTS and log the outcome.
# Globals:   RECIPIENTS (read) - whitespace-separated email addresses
# Arguments: $1 - subject line
#            $2 - message body, sent verbatim
# Outputs:   none; the result is recorded through log_message
# Returns:   the status of the log_message call
#######################################
send_alert() {
    local subject="$1"
    local body="$2"
    local -a recipients=()

    # Split the address list on whitespace without pathname expansion.
    # `read -d ''` consumes the whole string (so newlines separate addresses
    # too) and therefore always ends on EOF with a non-zero status, which is
    # why the result is deliberately ignored.
    read -r -d '' -a recipients <<< "$RECIPIENTS" || true

    # printf '%s' passes the body through unchanged; `echo -e` would rewrite
    # backslash sequences that appear in command output (e.g. "\n", "\t").
    # mail's stderr is discarded on purpose: failure is detected through its
    # exit status and reported via log_message.
    if printf '%s\n' "$body" | mail -s "$subject" "${recipients[@]}" 2>/dev/null; then
        log_message "Alert sent: $subject"
    else
        log_message "Error: Failed to send alert: $subject"
    fi
}

# Collect system metrics
#
# Each source command is run on its own so that its exit status (not that of
# a trailing filter) decides whether the "Failed to read" message is logged,
# and so that 2>/dev/null silences the command that can actually fail.
# LC_ALL=C keeps the output format of df/top/free independent of the locale.
# An empty metric is left for the validation step that follows to reject.

#######################################
# Convert `top -b` output into an integer CPU usage percentage.
# The idle figure is located by its "id" label rather than by field position:
# top pads its columns to a fixed width, so an idle value of 100.0 is printed
# as "ni,100.0 id" and shifts every whitespace-separated field by one.
# Arguments: none (reads top output on stdin)
# Outputs:   100 minus the idle percentage, truncated to an integer, for the
#            first "Cpu(s)" line; nothing if no such line is found
#######################################
parse_cpu_usage() {
    awk '
        !seen && /Cpu\(s\)/ && match($0, /[0-9]+([.,][0-9]+)?[ %]*id/) {
            idle = substr($0, RSTART, RLENGTH)
            sub(/,/, ".", idle)        # tolerate a decimal comma
            printf "%d\n", 100 - idle  # awk uses the leading numeric prefix
            seen = 1
        }'
}

load=$(awk '{print $1}' /proc/loadavg 2>/dev/null) \
    || log_message "Error: Failed to read load average"

# -P forces one line per filesystem, so the value is always on line 2 even
# when the device name is long.
df_output=$(LC_ALL=C df -P / 2>/dev/null) \
    || log_message "Error: Failed to read disk usage"
disk_usage=$(awk 'NR==2 {sub(/%/, "", $5); print $5}' <<< "$df_output")

top_output=$(LC_ALL=C top -bn1 2>/dev/null) \
    || log_message "Error: Failed to read CPU usage"
cpu_usage=$(parse_cpu_usage <<< "$top_output")

free_output=$(LC_ALL=C free 2>/dev/null) \
    || log_message "Error: Failed to read memory usage"
# The $2 > 0 guard avoids an awk division-by-zero abort on a zero total.
mem_usage=$(awk '/Mem:/ && $2 > 0 {printf "%.0f", $3/$2 * 100}' <<< "$free_output")

# Validate collected metrics and configured thresholds
#
# The metrics are checked exactly as before. The thresholds are checked too,
# because they can come from the environment and are later used in numeric
# comparisons: inside (( )) a word such as "abc" evaluates as an (unset)
# variable, i.e. 0, "85%" is a syntax error, and a leading zero ("065") is
# read as octal - all of which would silently distort or disable alerting.
decimal_re='^[0-9]+([.][0-9]+)?$'
integer_re='^[0-9]+$'
threshold_decimal_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
threshold_integer_re='^(0|[1-9][0-9]*)$'   # no leading zeros (octal in (( )))

#######################################
# Abort the script unless a value matches the expected pattern.
# Arguments: $1 - label used in the log message
#            $2 - value to check
#            $3 - extended regular expression the value must match
# Outputs:   on mismatch, logs "Error: Invalid <label>: <value>" via
#            log_message and exits the script with status 1
#######################################
require_match() {
    local label="$1"
    local value="$2"
    local pattern="$3"

    if ! [[ "$value" =~ $pattern ]]; then
        log_message "Error: Invalid $label: $value"
        exit 1
    fi
}

require_match "load value" "$load" "$decimal_re"
require_match "disk usage value" "$disk_usage" "$integer_re"
require_match "CPU usage value" "$cpu_usage" "$integer_re"
require_match "memory usage value" "$mem_usage" "$integer_re"

require_match "LOAD_THRESHOLD" "$LOAD_THRESHOLD" "$threshold_decimal_re"
require_match "DISK_THRESHOLD" "$DISK_THRESHOLD" "$threshold_integer_re"
require_match "CPU_THRESHOLD" "$CPU_THRESHOLD" "$threshold_integer_re"
require_match "MEM_THRESHOLD" "$MEM_THRESHOLD" "$threshold_integer_re"

# Check thresholds and send alerts
#
# Each metric is compared with its threshold; when it is exceeded, the output
# of a related diagnostic command is mailed as the alert body. A one-line
# summary of all metrics is logged at the end of every run.

#######################################
# Compare two (possibly fractional) numbers.
# awk is used because shell arithmetic is integer-only and awk is already a
# checked prerequisite of this script, whereas `bc` is not: with bc missing,
# the previous `(( $(... | bc -l) ))` test expanded to an empty expression
# and the load alert silently never fired.
# Arguments: $1 - value
#            $2 - limit
# Returns:   0 if value > limit, 1 otherwise
#######################################
exceeds_threshold() {
    awk -v value="$1" -v limit="$2" 'BEGIN { exit !((value + 0) > (limit + 0)) }'
}

if exceeds_threshold "$load" "$LOAD_THRESHOLD"; then
    body=$(sar -q 2>/dev/null || echo "Error collecting sar data")
    send_alert "High load on $HOSTNAME - [ $load ]" "$body"
fi

if (( disk_usage > DISK_THRESHOLD )); then
    body=$(df -h / 2>/dev/null || echo "Error collecting df data")
    send_alert "High disk usage on $HOSTNAME - [ ${disk_usage}% ]" "$body"
fi

if (( cpu_usage > CPU_THRESHOLD )); then
    # The pipeline's status is head's, so a failing top is detected by the
    # empty result instead of by exit status.
    body=$(top -bn1 2>/dev/null | head -n 12)
    [[ -n "$body" ]] || body="Error collecting top data"
    send_alert "High CPU usage on $HOSTNAME - [ ${cpu_usage}% ]" "$body"
fi

if (( mem_usage > MEM_THRESHOLD )); then
    body=$(free -h 2>/dev/null || echo "Error collecting free data")
    send_alert "High memory usage on $HOSTNAME - [ ${mem_usage}% ]" "$body"
fi

log_message "Monitoring completed: Load=$load, Disk=${disk_usage}%, CPU=${cpu_usage}%, Mem=${mem_usage}%"

.