Shell Scripting Series Part 2 — Real-World Automation Scripts
6 scripts you can use in production today — copy, customize, deploy. In Part 1, we covered the fundamentals of shell scripting. Now it's time to build real scripts that solve real problems.
Script 1: Server Health Check
This script checks CPU, memory, disk, and services — perfect for a quick status overview or as a cron job that emails you.
#!/bin/bash
# server-health-check.sh — Comprehensive server health report
# Usage: ./server-health-check.sh [--alert-only]
#   --alert-only   suppress [OK] lines; print only warnings and criticals
set -euo pipefail

# Optional first argument; empty means "print everything".
ALERT_ONLY="${1:-}"

# Alert thresholds (percent) — readonly so nothing later can clobber them.
readonly CPU_THRESHOLD=80
readonly MEM_THRESHOLD=80
readonly DISK_THRESHOLD=85
# Services whose absence counts as CRITICAL.
readonly CRITICAL_SERVICES=("nginx" "postgresql" "redis-server")

# ANSI colors for terminal output (NC = reset).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m'

# Running count of CRITICAL findings; drives the final exit code.
alerts=0
print_status() {
  # Print a colored status line and track CRITICAL findings.
  # $1 label, $2 numeric value, $3 critical threshold, $4 unit (default "%").
  # WARNING fires 10 points below the critical threshold.
  # Increments the global `alerts` counter on CRITICAL; always returns 0.
  local label="$1" value="$2" threshold="$3" unit="${4:-%}"
  if [[ "$value" -ge "$threshold" ]]; then
    echo -e " ${RED}[CRITICAL]${NC} $label: ${value}${unit} (threshold: ${threshold}${unit})"
    # Assignment form on purpose: ((alerts++)) evaluates to 0 when alerts
    # is 0 and so returns non-zero, which would abort the whole script
    # under `set -e` on the very first alert.
    alerts=$((alerts + 1))
  elif [[ "$value" -ge $((threshold - 10)) ]]; then
    echo -e " ${YELLOW}[WARNING]${NC} $label: ${value}${unit}"
  else
    # Plain `if` instead of `[[ ]] && echo`: the short-circuit form made the
    # function return 1 in --alert-only mode, tripping `set -e` at call sites.
    if [[ -z "$ALERT_ONLY" ]]; then
      echo -e " ${GREEN}[OK]${NC} $label: ${value}${unit}"
    fi
  fi
  return 0
}
echo "=== Server Health Check — $(hostname) — $(date) ==="
echo ""

# --- CPU ---
echo "--- CPU ---"
# Sum of user ($2) + system ($4) CPU from one batch iteration of top.
# NOTE(review): field positions assume procps-ng `top` output — verify on this distro.
cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print int($2 + $4)}')
print_status "CPU Usage" "$cpu_usage" "$CPU_THRESHOLD"

# 1-minute load average, shown with the core count for context.
load_1m=$(uptime | awk -F'load average:' '{print $2}' | awk -F, '{print $1}' | xargs)
cores=$(nproc)
[[ -z "$ALERT_ONLY" ]] && echo " Load: $load_1m (cores: $cores)"

# --- Memory ---
echo ""
echo "--- Memory ---"
mem_usage=$(free | awk '/Mem:/ {printf "%d", $3/$2 * 100}')
mem_total=$(free -h | awk '/Mem:/ {print $2}')
mem_used=$(free -h | awk '/Mem:/ {print $3}')
print_status "Memory" "$mem_usage" "$MEM_THRESHOLD"
[[ -z "$ALERT_ONLY" ]] && echo " Used: $mem_used / $mem_total"

# Swap — only meaningful when swap is actually configured.
swap_total=$(free | awk '/Swap:/ {print $2}')
if [[ "$swap_total" -gt 0 ]]; then
  swap_usage=$(free | awk '/Swap:/ {printf "%d", $3/$2 * 100}')
  print_status "Swap" "$swap_usage" 50
fi

# --- Disk ---
echo ""
echo "--- Disk ---"
# Simplified pipeline: the old version stripped the % with tr and then
# re-added it with awk; just strip it in the loop instead.
while read -r usage mount; do
  usage_num=${usage%\%}
  print_status "Disk ($mount)" "$usage_num" "$DISK_THRESHOLD"
done < <(df --output=pcent,target -x tmpfs -x devtmpfs | tail -n +2)

# --- Services ---
echo ""
echo "--- Services ---"
for svc in "${CRITICAL_SERVICES[@]}"; do
  if systemctl is-active --quiet "$svc" 2>/dev/null; then
    [[ -z "$ALERT_ONLY" ]] && echo -e " ${GREEN}[OK]${NC} $svc is running"
  else
    echo -e " ${RED}[CRITICAL]${NC} $svc is NOT running"
    # Assignment form: ((alerts++)) returns non-zero when alerts is 0,
    # which would abort the script here under `set -e`.
    alerts=$((alerts + 1))
  fi
done

# --- Summary ---
echo ""
echo "================================"
# Exit non-zero when anything critical was found so cron/monitoring can react.
if [[ "$alerts" -gt 0 ]]; then
  echo -e "${RED}$alerts alert(s) found!${NC}"
  exit 1
else
  echo -e "${GREEN}All checks passed.${NC}"
  exit 0
fi
Script 2: Log Cleanup
Old logs eat disk space silently. This script finds and cleans them safely.
#!/bin/bash
# log-cleanup.sh — Clean old log files with safety checks
# Usage: ./log-cleanup.sh [--dry-run]
set -euo pipefail

# "--dry-run" (or empty): when set, nothing is deleted or truncated.
DRY_RUN="${1:-}"
# Directories to scan.  NB: the second entry is a glob pattern and is
# expanded where the list is iterated, not here.
LOG_DIRS=("/var/log" "/opt/apps/*/logs")
# Rotated logs older than this many days are removed.
MAX_AGE_DAYS=30
# Active logs larger than this many MB are truncated.
MAX_SIZE_MB=500
# Running total of megabytes reclaimed (whole MB per file).
TOTAL_FREED=0

# Timestamped logger for all progress output.
log() { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"; }
cleanup_old_logs() {
  # Delete rotated logs (*.log.gz and *.log.N) older than MAX_AGE_DAYS
  # under $1.  Honors --dry-run; adds each file's size in whole MB to the
  # global TOTAL_FREED.  Missing directories are silently skipped.
  local dir="$1"
  [[ -d "$dir" ]] || return 0
  log "Scanning $dir for logs older than ${MAX_AGE_DAYS} days..."
  # One find covers both rotation styles — the previous two loops were
  # byte-identical except for the -name pattern.
  while IFS= read -r -d '' file; do
    local size size_mb
    size=$(stat --format="%s" "$file")
    size_mb=$((size / 1024 / 1024))
    if [[ "$DRY_RUN" == "--dry-run" ]]; then
      log " [DRY RUN] Would delete: $file (${size_mb}MB)"
    else
      rm -f -- "$file"
      log " Deleted: $file (${size_mb}MB)"
    fi
    TOTAL_FREED=$((TOTAL_FREED + size_mb))
  done < <(find "$dir" \( -name "*.log.gz" -o -name "*.log.[0-9]*" \) \
             -mtime +"${MAX_AGE_DAYS}" -print0 2>/dev/null)
}
truncate_large_logs() {
  # Trim active *.log files larger than MAX_SIZE_MB under $1 down to their
  # last 1000 lines.  The trimmed content is copied back into the SAME
  # file (copy-truncate) so processes holding the log open keep writing to
  # it — the previous tmp-then-mv approach replaced the inode, leaving
  # writers appending to a deleted file and the space never reclaimed.
  local dir="$1"
  [[ -d "$dir" ]] || return 0
  log "Checking for active logs larger than ${MAX_SIZE_MB}MB..."
  while IFS= read -r -d '' file; do
    local size_mb
    size_mb=$(stat --format="%s" "$file")
    size_mb=$((size_mb / 1024 / 1024))
    if [[ "$DRY_RUN" == "--dry-run" ]]; then
      log " [DRY RUN] Would truncate: $file (${size_mb}MB)"
    else
      # Keep last 1000 lines; write back in place to preserve the inode.
      tail -1000 "$file" > "${file}.tmp" \
        && cat "${file}.tmp" > "$file" \
        && rm -f -- "${file}.tmp"
      log " Truncated: $file (was ${size_mb}MB)"
    fi
  done < <(find "$dir" -name "*.log" -size +"${MAX_SIZE_MB}"M -print0 2>/dev/null)
}
# Clean journal logs
# Vacuum systemd-journald storage down to roughly the last week of entries.
# Real runs need sudo; the dry-run path only reads current disk usage.
clean_journal() {
log "Cleaning systemd journal (keeping 1 week)..."
if [[ "$DRY_RUN" != "--dry-run" ]]; then
# NOTE(review): non-interactive (cron) runs will hang or fail here if sudo
# prompts for a password — confirm a NOPASSWD rule covers journalctl.
sudo journalctl --vacuum-time=1weeks 2>&1 | tail -1
else
local journal_size
# NOTE(review): picking fields $7/$8 of `journalctl --disk-usage` depends
# on the exact message wording, which varies across systemd versions —
# verify the parsed value on the target host.
journal_size=$(journalctl --disk-usage 2>&1 | awk '{print $7, $8}')
log " [DRY RUN] Journal currently using: $journal_size"
fi
}
log "=== Log Cleanup Started ==="
[[ "$DRY_RUN" == "--dry-run" ]] && log "*** DRY RUN MODE — no files will be modified ***"
# Deliberately unquoted (shellcheck SC2068): LOG_DIRS entries may contain
# glob patterns like /opt/apps/*/logs that must expand here.  Paths
# containing spaces would break — acceptable for these fixed locations.
for dir in ${LOG_DIRS[@]}; do
cleanup_old_logs "$dir"
truncate_large_logs "$dir"
done
clean_journal
# TOTAL_FREED counts whole megabytes per file, so many small files can
# round down to ~0MB even when deletions happened.
log "=== Cleanup Complete — ~${TOTAL_FREED}MB freed ==="
Script 3: Backup Automation
A reliable backup script with compression, rotation, and verification.
#!/bin/bash
# backup.sh — Automated backup with rotation
# Usage: ./backup.sh /path/to/source [backup-name]
#
# Creates a timestamped tar.gz of SOURCE under /backups/<name>/, verifies
# the archive by listing it, writes a sha256 checksum alongside, and
# prunes backups older than KEEP_DAYS days.
set -euo pipefail

# First argument required; second defaults to the source dir's basename.
SOURCE="${1:?Usage: $0 /path/to/source [backup-name]}"
BACKUP_NAME="${2:-$(basename "$SOURCE")}"
BACKUP_BASE="/backups"
BACKUP_DIR="${BACKUP_BASE}/${BACKUP_NAME}"
KEEP_DAYS=7
TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
BACKUP_FILE="${BACKUP_DIR}/${BACKUP_NAME}_${TIMESTAMP}.tar.gz"

# Timestamped logger for all progress output.
log() { echo "[$(date '+%H:%M:%S')] $1"; }

# Create backup directory
mkdir -p "$BACKUP_DIR"

# Pre-flight checks
if [[ ! -d "$SOURCE" ]]; then
  log "ERROR: Source directory does not exist: $SOURCE"
  exit 1
fi
# Sizes are informational only — no space check is enforced before tar runs.
source_size=$(du -sh "$SOURCE" | awk '{print $1}')
available_space=$(df -h "$BACKUP_BASE" | awk 'NR==2 {print $4}')
log "Source size: $source_size | Available space: $available_space"

# Create the backup.  -C + basename keeps paths inside the archive relative.
# NOTE(review): stderr is suppressed, so tar warnings (e.g. "file changed as
# we read it") are invisible; a non-zero exit still aborts via set -e.
log "Creating backup: $BACKUP_FILE"
tar -czf "$BACKUP_FILE" -C "$(dirname "$SOURCE")" "$(basename "$SOURCE")" 2>/dev/null

# Verify the backup
backup_size=$(du -sh "$BACKUP_FILE" | awk '{print $1}')
file_count=$(tar -tzf "$BACKUP_FILE" | wc -l)
log "Backup created: $backup_size ($file_count files)"

# Verify integrity — a full listing pass catches truncated/corrupt archives.
if tar -tzf "$BACKUP_FILE" > /dev/null 2>&1; then
  log "Integrity check: PASSED"
else
  log "ERROR: Backup integrity check FAILED!"
  rm -f "$BACKUP_FILE"
  exit 1
fi

# Generate checksum
sha256sum "$BACKUP_FILE" > "${BACKUP_FILE}.sha256"
log "Checksum saved: ${BACKUP_FILE}.sha256"

# Rotate old backups
log "Removing backups older than $KEEP_DAYS days..."
deleted_count=0
while IFS= read -r -d '' old_backup; do
  rm -f -- "$old_backup" "${old_backup}.sha256"
  # Assignment form on purpose: ((deleted_count++)) returns non-zero when
  # the old value is 0, which would abort the script under `set -e` on the
  # first backup it rotates.
  deleted_count=$((deleted_count + 1))
done < <(find "$BACKUP_DIR" -name "*.tar.gz" -mtime +${KEEP_DAYS} -print0)
log "Removed $deleted_count old backup(s)"

# Summary
log "=== Backup Complete ==="
log " File: $BACKUP_FILE"
log " Size: $backup_size"
log " Files: $file_count"
ls -lh "$BACKUP_DIR"/ | tail -5
Script 4: Deployment Script
A structured deployment script with rollback capability.
#!/bin/bash
# deploy.sh — Deploy application with rollback support
# Usage: ./deploy.sh <environment> <version>
set -euo pipefail
# Both arguments are mandatory; ${var:?} aborts with the usage string.
ENV="${1:?Usage: $0 <environment> <version>}"
VERSION="${2:?Usage: $0 <environment> <version>}"
# Release layout: one checkout per version under releases/, with a
# `current` symlink pointing at the live release (atomic switch).
APP_DIR="/opt/myapp"
RELEASES_DIR="${APP_DIR}/releases"
CURRENT_LINK="${APP_DIR}/current"
REPO_URL="https://github.com/myorg/myapp.git"
# Timestamped, environment-tagged logger.
log() { echo "[$(date '+%H:%M:%S')] [$ENV] $1"; }
rollback() {
  # Failure handler (also wired to `trap ... ERR`): report which release is
  # still live, discard the half-deployed release directory, and exit 1.
  log "ERROR: Deployment failed! Rolling back..."
  local active_release
  if [[ -L "$CURRENT_LINK" ]]; then
    active_release=$(readlink "$CURRENT_LINK")
    log "Current version still active: $active_release"
  fi
  # Clean up failed release
  if [[ -d "${RELEASES_DIR}/${VERSION}" ]]; then
    rm -rf "${RELEASES_DIR}/${VERSION}"
  fi
  log "Rollback complete"
  exit 1
}
# Any command failing under `set -e` fires the ERR trap → rollback.
trap rollback ERR
# Validate environment
case "$ENV" in
dev|staging|production) ;;
*) log "Invalid environment: $ENV"; exit 1 ;;
esac
log "Starting deployment: version $VERSION to $ENV"
# Create release directory
mkdir -p "${RELEASES_DIR}/${VERSION}"
# Download/clone the release — $VERSION must exist as a git tag or branch.
# NOTE(review): stderr is discarded, so a failed clone reaches rollback with
# no diagnostic output.
log "Fetching version $VERSION..."
git clone --depth 1 --branch "$VERSION" "$REPO_URL" "${RELEASES_DIR}/${VERSION}" 2>/dev/null
# Install dependencies
log "Installing dependencies..."
cd "${RELEASES_DIR}/${VERSION}"
npm ci --production 2>/dev/null
# Run tests (skip in dev)
if [[ "$ENV" != "dev" ]]; then
log "Running tests..."
npm test 2>/dev/null
fi
# Switch the symlink (atomic deployment)
log "Switching to new version..."
ln -sfn "${RELEASES_DIR}/${VERSION}" "$CURRENT_LINK"
# Restart the service
log "Restarting application..."
sudo systemctl restart myapp
# Health check — curl inside `if` does not trigger the ERR trap, so the
# failure branch must call rollback explicitly.
# NOTE(review): at this point `current` already points at the NEW release;
# rollback deletes that directory but does not repoint the symlink or
# restart the old release, leaving `current` dangling — confirm intended.
log "Running health check..."
sleep 3
if curl -sf http://localhost:3000/health > /dev/null; then
log "Health check PASSED"
else
log "Health check FAILED"
rollback
fi
# Clean old releases (keep last 5)
# NOTE(review): parsing `ls` output breaks on paths with spaces/newlines;
# release names are version strings here, but a find-based prune is safer.
log "Cleaning old releases..."
ls -dt "${RELEASES_DIR}"/*/ | tail -n +6 | xargs rm -rf 2>/dev/null || true
log "=== Deployment Complete: $VERSION on $ENV ==="
Script 5: Disk Space Alert
Monitor disk space and send alerts before things break.
#!/bin/bash
# disk-alert.sh — Monitor disk space and alert
# Run via cron: */15 * * * * /opt/scripts/disk-alert.sh
set -euo pipefail

# Percent-full levels at which warning / critical alerts fire.
WARNING_THRESHOLD=75
CRITICAL_THRESHOLD=90
# Prefix for the per-mount marker files that rate-limit repeat alerts.
ALERT_FILE="/tmp/disk-alert-sent"
# Cached once so every alert message carries the same host name.
HOSTNAME="$(hostname)"
send_alert() {
  # Deliver one alert: always to syslog and stdout (picked up by cron
  # mail), plus Slack when SLACK_WEBHOOK is set in the environment.
  # $1 level (WARNING|CRITICAL), $2 mount point, $3 usage percent.
  local level="$1" mount="$2" usage="$3"
  local message="[$level] $HOSTNAME: $mount is ${usage}% full"
  # Log to syslog
  logger -t disk-alert "$message"
  # Send to Slack webhook (replace URL with yours)
  if [[ -n "${SLACK_WEBHOOK:-}" ]]; then
    local color="warning"
    [[ "$level" == "CRITICAL" ]] && color="danger"
    # Escape backslashes and double quotes so the message cannot break out
    # of the JSON string (mount points may contain odd characters).
    local json_msg=${message//\\/\\\\}
    json_msg=${json_msg//\"/\\\"}
    # `|| true`: alerting is best-effort — a curl failure (webhook down)
    # must not kill the monitoring run under `set -e`.
    curl -s -X POST "$SLACK_WEBHOOK" \
      -H 'Content-type: application/json' \
      -d "{\"attachments\":[{\"color\":\"$color\",\"text\":\"$json_msg\"}]}" \
      > /dev/null 2>&1 || true
  fi
  echo "$message"
}
# Check each filesystem (tmpfs/devtmpfs excluded).  The feeding pipeline
# normalizes df output to "<pcent>% <mount>" per line.
while read -r usage mount; do
usage_num=${usage%\%}
if [[ "$usage_num" -ge "$CRITICAL_THRESHOLD" ]]; then
# Only alert once per mount per hour
# Marker file per mount+level: `find -mmin +60` prints the path only when
# the marker is older than 60 minutes, which re-arms the alert; touching
# it below resets the clock.
alert_key="${ALERT_FILE}-critical-$(echo "$mount" | tr '/' '_')"
if [[ ! -f "$alert_key" ]] || [[ $(find "$alert_key" -mmin +60 2>/dev/null) ]]; then
send_alert "CRITICAL" "$mount" "$usage_num"
touch "$alert_key"
fi
elif [[ "$usage_num" -ge "$WARNING_THRESHOLD" ]]; then
# Warnings are throttled harder: one per mount per 4 hours.
alert_key="${ALERT_FILE}-warning-$(echo "$mount" | tr '/' '_')"
if [[ ! -f "$alert_key" ]] || [[ $(find "$alert_key" -mmin +240 2>/dev/null) ]]; then
send_alert "WARNING" "$mount" "$usage_num"
touch "$alert_key"
fi
fi
done < <(df --output=pcent,target -x tmpfs -x devtmpfs | tail -n +2 | awk '{gsub(/%/,"",$1); print $1"%", $2}')
Script 6: User Management
Automate user provisioning from a CSV file.
#!/bin/bash
# manage-users.sh — Bulk user management from CSV
# Usage: ./manage-users.sh users.csv
# CSV format: username,fullname,group,ssh_key_url
set -euo pipefail
# The CSV path is required; ${1:?} aborts with the usage message.
CSV_FILE="${1:?Usage: $0 users.csv}"
# Timestamped logger for all progress output.
log() { echo "[$(date '+%H:%M:%S')] $1"; }
create_user() {
# Provision one account from a CSV row: ensure the group exists, create the
# user, install an SSH key, and force a password change on first login.
# Idempotent: existing users are skipped.  All mutations go through sudo.
# $1 username, $2 full name (GECOS), $3 primary group, $4 SSH key ("none"/empty to skip)
local username="$1" fullname="$2" group="$3" ssh_key="$4"
# Create group if it doesn't exist
if ! getent group "$group" > /dev/null 2>&1; then
sudo groupadd "$group"
log "Created group: $group"
fi
# Create user if they don't exist
if id "$username" > /dev/null 2>&1; then
log "User already exists: $username — skipping"
return 0
fi
sudo useradd -m -c "$fullname" -g "$group" -s /bin/bash "$username"
log "Created user: $username ($fullname) in group $group"
# Set up SSH key if provided
# NOTE(review): the CSV header calls this column ssh_key_url, but the value
# is written verbatim into authorized_keys — confirm whether it should be
# fetched from a URL instead of treated as literal key material.
if [[ -n "$ssh_key" && "$ssh_key" != "none" ]]; then
local ssh_dir="/home/${username}/.ssh"
sudo mkdir -p "$ssh_dir"
echo "$ssh_key" | sudo tee "${ssh_dir}/authorized_keys" > /dev/null
# sshd refuses keys unless .ssh is 700 and authorized_keys is 600.
sudo chmod 700 "$ssh_dir"
sudo chmod 600 "${ssh_dir}/authorized_keys"
sudo chown -R "${username}:${group}" "$ssh_dir"
log " SSH key configured for $username"
fi
# Force password change on first login
sudo passwd -e "$username" > /dev/null 2>&1
log " Password reset required on first login for $username"
}
# Validate CSV exists
if [[ ! -f "$CSV_FILE" ]]; then
  log "ERROR: File not found: $CSV_FILE"
  exit 1
fi

log "=== User Management Started ==="

# Read the CSV, skipping the header line.  Process substitution — rather
# than `tail | while` — keeps the loop out of a pipe subshell, and the
# `|| [[ -n "$username" ]]` clause still processes a final row when the
# file lacks a trailing newline (the old pipe version silently dropped it).
while IFS=',' read -r username fullname group ssh_key || [[ -n "$username" ]]; do
  # Skip empty lines and comments
  [[ -z "$username" || "$username" == \#* ]] && continue
  # </dev/null keeps sudo/useradd inside create_user from consuming the
  # remaining CSV rows off the loop's stdin.
  create_user "$username" "$fullname" "$group" "$ssh_key" < /dev/null
done < <(tail -n +2 "$CSV_FILE")

log "=== User Management Complete ==="
log "Current users:"
awk -F: '$3 >= 1000 && $3 < 65534 {print " "$1" ("$5") — "$7}' /etc/passwd
This is Part 2 of our Shell Scripting series. Catch up: Part 1 — Variables, Loops, and Functions. Next: Part 3 — Error Handling, Logging, and Production Scripts where we'll make these scripts bulletproof.
