Skip to main content

Kiwiplan - Server Load Monitor

This script is based on this initial loadmon script

This is to monitor specific objects at a Advantive/Kiwiplan sites

It will send something like this for PCS

image.png

#!/bin/bash
############################################
#
# Author: Steve Ling 5/2/25
# Modified:
# Added PCS Scheduler to the monitorying 8/14/25
#
# Purpose: Monitor system metrics including:
# - Average Load
# - CPU Usage
# - Disk Space
# - Memory Usage
# - MySQL Table Record Count
#
# Installation:
# 1. Copy to /opt/scripts/
# 2. Set permissions: chmod 755 /opt/scripts/loadmon.sh
# 3. Set ownership: chown remuser:kiwiplan /opt/scripts/loadmon.sh
# 4. Add cron job as root to run every 5 minutes:
#    */5 * * * * /opt/scripts/loadmon.sh >/dev/null 2>&1 | logger
#
# MySQL Configuration:
# Set the following environment variables:
# - MYSQL_HOST: MySQL host (default: localhost)
# - MYSQL_USER: MySQL username
# - MYSQL_PASSWORD: MySQL password
# - MYSQL_DATABASE: Database name
# - MYSQL_TABLE: Table to monitor
# - MYSQL_RECORD_THRESHOLD: Record count threshold (default: 100000)
#
# Improvements:
# - Added MySQL table record count check
# - Added error handling for MySQL commands
# - Improved logging with timestamps
# - Made thresholds configurable via environment variables
# - Added hostname to alerts for clarity
# - Replaced mutt with mail (more common)
# - Optimized command execution
# - Added input validation
#
############################################

# Exit on any error
set -e

# Configuration (can be overridden via environment variables)
: "${LOAD_THRESHOLD:=10.00}"       # Load average threshold
: "${DISK_THRESHOLD:=85}"          # Disk usage threshold (%)
: "${CPU_THRESHOLD:=65}"           # CPU usage threshold (%)
: "${MEM_THRESHOLD:=85}"           # Memory usage threshold (%)
: "${MYSQL_RECORD_THRESHOLD:=100}" # MySQL table record count threshold
: "${RECIPIENTS:=steve.ling@sflservicesllc.com}" # Space-separated email addresses
: "${HOSTNAME:=$(hostname -s)}"    # Short hostname for alerts
: "${LOG_FILE:=/var/log/loadmon.log}" # Log file location
: "${MYSQL_HOST:=localhost}"       # MySQL host
: "${MYSQL_USER:=}"                # MySQL username
: "${MYSQL_PASSWORD:=}"            # MySQL password
: "${MYSQL_DATABASE_PREFIX:=}"     # MySQL database prefix

# Ensure required commands are available
for cmd in awk df top free mail logger mysql; do
    if ! command -v "$cmd" &>/dev/null; then
        echo "Error: Required command '$cmd' not found" | logger -t loadmon
        exit 1
    fi
done

# Function to log messages with timestamp
log_message() {
    local message="$1"
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $message" | logger -t loadmon
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $message" >> "$LOG_FILE"
}

# Function to send email alerts
send_alert() {
    local subject="$1"
    local body="$2"
    if echo -e "$body" | mail -s "$subject" $RECIPIENTS 2>/dev/null; then
        log_message "Alert sent: $subject"
    else
        log_message "Error: Failed to send alert: $subject"
    fi
}

# Collect system metrics
load=$(awk '{print $1}' /proc/loadavg 2>/dev/null || log_message "Error: Failed to read load average")
disk_usage=$(df / | awk 'NR==2 {print $5}' | sed 's/%//' 2>/dev/null || log_message "Error: Failed to read disk usage")
cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print 100 - $8}' | cut -d. -f1 2>/dev/null || log_message "Error: Failed to read CPU usage")
mem_usage=$(free | awk '/Mem:/ {printf "%.0f", $3/$2 * 100}' 2>/dev/null || log_message "Error: Failed to read memory usage")

# Collect PCS Secduler record count (if configured)
mysql_record_count=0
if [ -n "$MYSQL_USER" ] && [ -n "$MYSQL_DATABASE_PREFIX" ]; then
    mysql_record_count=$(mysql -h "$MYSQL_HOST" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -D "$MYSQL_DATABASE" -N -e "USE ${MYSQL_DATABASE_PREFIX}_pcs;SELECT COUNT(*) FROM schedulemachineavailability WHERE pcsSchedule NOT IN (SELECT pcsSchedule FROM schedulerun)" 2>/dev/null) || {
        log_message "Error: Failed to query MySQL table $MYSQL_DATABASE.$MYSQL_TABLE"
        mysql_record_count=0
    }
else
    log_message "Warning: PCS Sceduler configuration incomplete (USER or DATABASE missing)"
fi

# Validate collected metrics
if ! [[ "$load" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    log_message "Error: Invalid load value: $load"
    exit 1
fi
if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
    log_message "Error: Invalid disk usage value: $disk_usage"
    exit 1
fi
if ! [[ "$cpu_usage" =~ ^[0-9]+$ ]]; then
    log_message "Error: Invalid CPU usage value: $cpu_usage"
    exit 1
fi
if ! [[ "$mem_usage" =~ ^[0-9]+$ ]]; then
    log_message "Error: Invalid memory usage value: $mem_usage"
    exit 1
fi
if ! [[ "$mysql_record_count" =~ ^[0-9]+$ ]]; then
    log_message "Error: Invalid MySQL record count: $mysql_record_count"
    mysql_record_count=0
fi

# Check thresholds and send alerts
if (( $(echo "$load > $LOAD_THRESHOLD" | bc -l) )); then
    body=$(sar -q 2>/dev/null || echo "Error collecting sar data")
    send_alert "High load on $HOSTNAME - [ $load ]" "$body"
fi

if (( disk_usage > DISK_THRESHOLD )); then
    body=$(df -h / 2>/dev/null || echo "Error collecting df data")
    send_alert "High disk usage on $HOSTNAME - [ ${disk_usage}% ]" "$body"
fi

if (( cpu_usage > CPU_THRESHOLD )); then
    body=$(top -bn1 | head -n 12 2>/dev/null || echo "Error collecting top data")
    send_alert "High CPU usage on $HOSTNAME - [ ${cpu_usage}% ]" "$body"
fi

if (( mem_usage > MEM_THRESHOLD )); then
    body=$(free -h 2>/dev/null || echo "Error collecting free data")
    send_alert "High memory usage on $HOSTNAME - [ ${mem_usage}% ]" "$body"
fi

if [ "$mysql_record_count" -gt "$MYSQL_RECORD_THRESHOLD" ]; then
    body="PCS VUE Scheduler has an issue\nTable schedulemachineavailability has $mysql_record_count\n\nWhich is more records then the number of ran schedules in the schedulerun table"
    send_alert "PCS VUE Scheduler record count on $HOSTNAME - [ $mysql_record_count ]" "$body"
fi

log_message "Monitoring completed: Load=$load, Disk=${disk_usage}%, CPU=${cpu_usage}%, Mem=${mem_usage}%, PCS Scheduler Records=$mysql_record_count"