1. 服务器巡检脚本

每天早上跑一遍,看看服务器有没有问题。

#!/bin/bash
#
# Daily server health check. Prints system info, CPU, memory, disk, network
# connection counts, the top CPU/memory processes and the state of a fixed
# list of systemd services. Memory or disk usage above 80% is printed in red.

# ANSI colour escapes consumed by the `echo -e` calls below.
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m'

echo "========== 服务器巡检 $(date '+%Y-%m-%d %H:%M') =========="

# 1. System information
echo -e "\n【系统信息】"
echo "主机名: $(hostname)"
echo "IP地址: $(hostname -I | awk '{print $1}')"
echo "系统版本: $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
echo "运行时间: $(uptime -p)"

# 2. CPU
echo -e "\n【CPU】"
CPU_USAGE=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
echo "CPU使用率: ${CPU_USAGE}%"
LOAD=$(uptime | awk -F'load average:' '{print $2}')
echo "负载:${LOAD}"

# 3. Memory
echo -e "\n【内存】"
free -h | head -2

# Used/total as an integer percentage. The `$2 > 0` guard avoids a division
# by zero, and the numeric check below keeps the comparison from erroring out
# when `free` produced no matching line (e.g. localized output).
MEM_USAGE=$(free | awk '/Mem/ && $2 > 0 {print int($3/$2 * 100)}')
if [[ "$MEM_USAGE" =~ ^[0-9]+$ ]] && (( MEM_USAGE > 80 )); then
    echo -e "${RED}警告: 内存使用率 ${MEM_USAGE}% 超过80%${NC}"
fi

# 4. Disk
echo -e "\n【磁盘】"
df -h | grep -E '^/dev/'

# Check usage per filesystem. `df -P` guarantees one line per filesystem, and
# a single awk pass extracts "percent mountpoint" (mount points containing
# spaces are re-joined from field 6 onwards).
df -P | awk '$1 ~ /^\/dev\// {
    pct = $5
    sub(/%/, "", pct)
    mount = $6
    for (i = 7; i <= NF; i++) mount = mount " " $i
    print pct, mount
}' | while read -r USAGE MOUNT; do
    if [[ "$USAGE" =~ ^[0-9]+$ ]] && (( USAGE > 80 )); then
        echo -e "${RED}警告: $MOUNT 使用率 ${USAGE}% 超过80%${NC}"
    fi
done

# 5. Network
echo -e "\n【网络连接】"
echo "ESTABLISHED: $(netstat -an | grep -c ESTABLISHED)"
echo "TIME_WAIT: $(netstat -an | grep -c TIME_WAIT)"

# 6. Processes (head -6 keeps the header + 5 rows, tail -5 drops the header)
echo -e "\n【Top 5 CPU进程】"
ps aux --sort=-%cpu | head -6 | tail -5

echo -e "\n【Top 5 内存进程】"
ps aux --sort=-%mem | head -6 | tail -5

# 7. Service status
echo -e "\n【服务状态】"
for svc in nginx mysql redis docker; do
    if systemctl is-active --quiet "$svc" 2>/dev/null; then
        echo -e "$svc: ${GREEN}运行中${NC}"
    else
        echo -e "$svc: ${RED}未运行${NC}"
    fi
done

echo -e "\n========== 巡检完成 =========="

2. 日志清理脚本

日志不清理,磁盘早晚爆。

#!/bin/bash
#
# Log cleanup: removes old rotated/compressed logs from LOG_DIRS, empties
# oversized live *.log files, empties Docker container JSON logs and trims
# the systemd journal.

# Directories to clean; entries that do not exist are skipped.
LOG_DIRS=(
    "/var/log/nginx"
    "/var/log/app"
    "/data/logs"
)

# Retention in days, applied to rotated files and to the journal.
KEEP_DAYS=7

for dir in "${LOG_DIRS[@]}"; do
    if [ -d "$dir" ]; then
        echo "清理 $dir ..."

        # Delete rotated (*.log.*) and compressed (*.gz) logs older than
        # KEEP_DAYS. `-type f` keeps directories whose names happen to match
        # the patterns out of the delete set.
        find "$dir" -type f \( -name "*.log.*" -o -name "*.gz" \) \
            -mtime +"$KEEP_DAYS" -delete

        # Live logs over 100M are emptied in place rather than removed
        # (presumably so processes that hold them open keep writing to the
        # same file — confirm against how the applications log).
        find "$dir" -type f -name "*.log" -size +100M -exec truncate -s 0 {} +

        echo "完成"
    fi
done

# Empty every container's JSON log when docker is installed. Errors are
# silenced on purpose: the glob matches nothing on hosts without containers.
if command -v docker &> /dev/null; then
    echo "清理Docker日志..."
    truncate -s 0 /var/lib/docker/containers/*/*-json.log 2>/dev/null
fi

# Trim the systemd journal to the same retention window as the files above.
journalctl --vacuum-time="${KEEP_DAYS}d"

echo "日志清理完成"

3. MySQL备份脚本

每天凌晨跑,备份到本地+远程。

#!/bin/bash
#
# Nightly MySQL backup: dumps every non-system database to a gzip file in
# BACKUP_DIR, mirrors the directory to a remote host with rsync, then prunes
# local dumps older than KEEP_DAYS.
#
# Exit status: 0 when every step succeeded, 1 otherwise.

# Make `mysqldump | gzip` fail when mysqldump fails; without this a broken
# dump would leave a truncated .gz that looks like a valid backup.
set -o pipefail

DB_HOST="localhost"
DB_USER="backup"
DB_PASS="your_password"
BACKUP_DIR="/backup/mysql"
DATE=$(date +%Y%m%d_%H%M)
KEEP_DAYS=7

# rsync destination; leave REMOTE_HOST empty to skip the remote copy.
# NOTE(review): the original value was destroyed by e-mail obfuscation during
# extraction ("[email protected]"); this is a placeholder — set your own user@host.
REMOTE_HOST="backup@backup.example.com"
REMOTE_DIR="/backup/mysql"

if ! mkdir -p "$BACKUP_DIR"; then
    echo "无法创建备份目录: $BACKUP_DIR" >&2
    exit 1
fi

# Hand credentials to the client tools through a private option file instead
# of -p on the command line, where they would be visible in the process list.
# mktemp creates the file readable by the owner only; the trap removes it on
# every exit path.
# NOTE: a password containing '"' or '\' needs escaping for option-file syntax.
MYSQL_CNF=$(mktemp) || exit 1
trap 'rm -f -- "$MYSQL_CNF"' EXIT
printf '[client]\nhost=%s\nuser=%s\npassword="%s"\n' \
    "$DB_HOST" "$DB_USER" "$DB_PASS" > "$MYSQL_CNF"

echo "[$(date)] 开始备份..."

# All databases except the header line and the server's internal schemas.
DATABASES=$(mysql --defaults-extra-file="$MYSQL_CNF" -e "SHOW DATABASES" \
    | grep -Ev "^(Database|information_schema|performance_schema|sys)$")

if [ -z "$DATABASES" ]; then
    echo "[$(date)] 获取数据库列表失败" >&2
    exit 1
fi

FAILED=0

# Word splitting on $DATABASES is intentional: one database name per word.
for db in $DATABASES; do
    echo "备份数据库: $db"
    DUMP_FILE="${BACKUP_DIR}/${db}_${DATE}.sql.gz"
    if ! mysqldump --defaults-extra-file="$MYSQL_CNF" \
            --single-transaction \
            --routines \
            --triggers \
            "$db" | gzip > "$DUMP_FILE"; then
        echo "备份失败: $db" >&2
        # Never keep a partial dump around.
        rm -f -- "$DUMP_FILE"
        FAILED=1
    fi
done

echo "[$(date)] 本地备份完成"

if [ -n "$REMOTE_HOST" ]; then
    echo "同步到远程..."
    if rsync -avz --delete "$BACKUP_DIR/" "$REMOTE_HOST:$REMOTE_DIR/"; then
        echo "远程同步完成"
    else
        echo "远程同步失败" >&2
        FAILED=1
    fi
fi

# Prune old dumps only after a fully successful run, so a streak of failed
# nights cannot age out the last good backups.
if [ "$FAILED" -eq 0 ]; then
    find "$BACKUP_DIR" -type f -name "*.sql.gz" -mtime +"$KEEP_DAYS" -delete
else
    echo "本次备份存在失败,跳过清理旧备份" >&2
fi

echo "[$(date)] 备份任务完成"
exit "$FAILED"

4. 批量执行命令脚本

管理多台服务器,一个个登录太麻烦。

#!/bin/bash
#
# Run one command on every host in SERVERS over ssh, printing a banner per
# host followed by that host's output.
#
# Usage: batch_exec.sh 'command'

# Target hosts in user@host form.
# NOTE(review): the original entries were destroyed by e-mail obfuscation
# during extraction ("[email protected]"); these are placeholders — replace
# them with your own servers.
SERVERS=(
    "root@192.168.1.10"
    "root@192.168.1.20"
    "root@192.168.1.30"
)

# The command to run remotely, passed as a single quoted argument.
CMD="$1"

if [ -z "$CMD" ]; then
    echo "用法: $0 '命令'"
    echo "示例: $0 'df -h'"
    exit 1
fi

for server in "${SERVERS[@]}"; do
    echo "========== $server =========="
    # stderr stays suppressed as in the original, so only stdout is shown.
    ssh -o ConnectTimeout=5 "$server" "$CMD" 2>/dev/null
    rc=$?
    # ssh exits with 255 for its own errors (connect/auth); any other
    # non-zero status is the remote command's exit code and must not be
    # reported as a connection failure.
    if [ "$rc" -eq 255 ]; then
        echo "连接失败"
    elif [ "$rc" -ne 0 ]; then
        echo "命令执行失败 (退出码: $rc)"
    fi
    echo ""
done

使用方法:

./batch_exec.sh "df -h"


./batch_exec.sh "uptime"


./batch_exec.sh "systemctl restart nginx"

5. 端口检测脚本

检测服务是否正常,不正常就告警。

#!/bin/bash
#
# Port monitor settings.

# DingTalk robot endpoint that alert messages are posted to.
WEBHOOK_URL='https://oapi.dingtalk.com/robot/send?access_token=xxx'

# Endpoints to probe, one "host:port:label" entry each.
SERVICES=()
SERVICES+=('192.168.1.10:80:Nginx')
SERVICES+=('192.168.1.20:3306:MySQL')
SERVICES+=('192.168.1.30:6379:Redis')

# Probe a TCP port through bash's /dev/tcp redirection.
# Arguments: $1 - host, $2 - port
# Returns:   0 if the connection opened within 3 seconds, non-zero otherwise
#            (124 when `timeout` had to kill the attempt).
check_port() {
    local host=$1
    local port=$2
    # Host and port travel as positional parameters of the inner shell rather
    # than being spliced into its command string, so characters such as ';'
    # in a config entry can never be executed as shell code.
    timeout 3 bash -c 'echo > "/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null
}

# Post a text alert to the DingTalk webhook in WEBHOOK_URL.
# Globals:   WEBHOOK_URL (read)
# Arguments: $1 - alert text; sent prefixed with "[告警] "
# Returns:   curl's exit status
send_alert() {
    local msg=$1
    # Escape backslash, double quote and newline so the text cannot break out
    # of the JSON string it is embedded in.
    msg=${msg//\\/\\\\}
    msg=${msg//\"/\\\"}
    msg=${msg//$'\n'/\\n}
    # --max-time bounds the call so an unreachable webhook cannot stall the
    # script; the URL is quoted because it contains '?', a glob character.
    curl -s --max-time 10 -H "Content-Type: application/json" \
        -d "{\"msgtype\":\"text\",\"text\":{\"content\":\"[告警] $msg\"}}" \
        "$WEBHOOK_URL" > /dev/null
}

# Probe every configured endpoint; print one status line per endpoint and
# send an alert for each one that does not answer.
for item in "${SERVICES[@]}"; do
    # Each entry is "host:port:name".
    IFS=':' read -r host port name <<< "$item"

    # Quoted so an empty or malformed field is passed through as-is instead
    # of vanishing and shifting the remaining argument.
    if check_port "$host" "$port"; then
        echo "$name ($host:$port) - OK"
    else
        echo "$name ($host:$port) - FAILED"
        send_alert "$name 服务异常 ($host:$port)"
    fi
done

加到crontab里,每分钟跑一次:

* * * * * /opt/scripts/check_ports.sh >> /var/log/check_ports.log 2>&1

6. 一键部署脚本

新服务器装环境用。

#!/bin/bash
#
# One-shot bootstrap for a fresh apt-based server: system update, base tools,
# timezone, SSH hardening, ufw firewall and an "ops" user.
#
# WARNING: step 4 turns off root login and password authentication. Install a
# public key for the ops user before closing the session you run this from.

# Every step below modifies system state and needs root.
if [ "$(id -u)" -ne 0 ]; then
    echo "请以 root 身份运行此脚本" >&2
    exit 1
fi

echo "=== 服务器初始化 ==="

echo "[1/6] 更新系统..."
apt update && apt upgrade -y

echo "[2/6] 安装基础工具..."
apt install -y vim curl wget git htop iotop iftop net-tools \
    tree lsof telnet rsync zip unzip

echo "[3/6] 配置时区..."
timedatectl set-timezone Asia/Shanghai

echo "[4/6] 配置SSH..."
# These substitutions only rewrite the stock commented-out defaults; a config
# that was already customised is left untouched.
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin no/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config
# Validate before restarting so a broken config cannot take SSH down.
if sshd -t; then
    # The unit is named "ssh" on Debian/Ubuntu; "sshd" may exist only as an
    # alias, so try both.
    systemctl restart ssh 2>/dev/null || systemctl restart sshd
else
    echo "sshd 配置校验失败,未重启 SSH 服务" >&2
fi

echo "[5/6] 配置防火墙..."
apt install -y ufw
ufw default deny incoming
ufw default allow outgoing
ufw allow ssh
ufw --force enable

echo "[6/6] 创建运维用户..."
# Skip creation when the user already exists so the script can be re-run.
if ! id ops &> /dev/null; then
    useradd -m -s /bin/bash ops
fi
mkdir -p /home/ops/.ssh
# Nothing above creates authorized_keys; make sure it exists so the chmod
# below succeeds and the file is ready to receive a public key.
touch /home/ops/.ssh/authorized_keys

chown -R ops:ops /home/ops/.ssh
chmod 700 /home/ops/.ssh
chmod 600 /home/ops/.ssh/authorized_keys

# Password login is off by now; an empty key file means nobody can log in.
if [ ! -s /home/ops/.ssh/authorized_keys ]; then
    echo "警告: /home/ops/.ssh/authorized_keys 为空,请在断开当前会话前写入公钥,否则将无法登录" >&2
fi

echo "=== 初始化完成 ==="

7. 证书过期检测

Let's Encrypt证书90天过期,忘了续期就尴尬了。

#!/bin/bash
#
# TLS certificate expiry check: for each domain, fetch the certificate served
# on port 443, print the days remaining and post a DingTalk alert when fewer
# than ALERT_DAYS are left.

DOMAINS=(
    "www.example.com"
    "api.example.com"
)

# Alert when a certificate has fewer than this many days left.
ALERT_DAYS=14
WEBHOOK_URL="https://oapi.dingtalk.com/robot/send?access_token=xxx"

for domain in "${DOMAINS[@]}"; do

    # Extract the notAfter date of the served certificate. stdin comes from
    # /dev/null so s_client exits after the handshake, and `timeout` keeps an
    # unresponsive host from hanging the whole run.
    EXPIRE_DATE=$(timeout 10 openssl s_client -servername "$domain" \
            -connect "$domain:443" < /dev/null 2>/dev/null \
        | openssl x509 -noout -enddate 2>/dev/null \
        | cut -d= -f2)

    if [ -z "$EXPIRE_DATE" ]; then
        echo "$domain: 获取证书失败"
        continue
    fi

    # An unparsable date would otherwise become an empty timestamp and yield
    # a bogus, hugely negative day count (and a false alert).
    if ! EXPIRE_TS=$(date -d "$EXPIRE_DATE" +%s); then
        echo "$domain: 无法解析证书过期时间: $EXPIRE_DATE"
        continue
    fi
    NOW_TS=$(date +%s)
    DAYS_LEFT=$(( (EXPIRE_TS - NOW_TS) / 86400 ))

    echo "$domain: 剩余 $DAYS_LEFT 天"

    if (( DAYS_LEFT < ALERT_DAYS )); then
        # The URL is quoted because it contains '?', a glob character.
        curl -s --max-time 10 -H "Content-Type: application/json" \
            -d "{\"msgtype\":\"text\",\"text\":{\"content\":\"[证书告警] $domain 证书将在 $DAYS_LEFT 天后过期\"}}" \
            "$WEBHOOK_URL"
    fi
done

8. 进程守护脚本

有些服务没有systemd管理,挂了得手动拉起。

#!/bin/bash
#
# Process watchdog settings.

# File that restart attempts and their outcome are appended to.
LOG_FILE='/var/log/daemon.log'
# Command executed to bring the process back up.
START_CMD='/opt/app/start.sh'
# Pattern handed to `pgrep -f` to decide whether the process is alive.
PROCESS_NAME='your_app'

# Report whether any process's full command line matches PROCESS_NAME.
# Globals: PROCESS_NAME (read)
# Returns: pgrep's exit status (0 when at least one process matched).
check_process() {
    # Last command in the function, so its status is the function's status.
    pgrep -f "$PROCESS_NAME" 1> /dev/null
}

# Append a timestamped line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log() {
    # The redirect target is quoted so a log path containing spaces works
    # instead of failing with "ambiguous redirect".
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$LOG_FILE"
}

# Start the process when it is missing, then record whether the restart
# took effect; when it is already alive just say so.
if ! check_process; then
    log "$PROCESS_NAME 未运行,正在启动..."
    # Left unquoted so START_CMD may carry arguments.
    $START_CMD
    # Give the process a moment to come up before re-checking.
    sleep 3

    if ! check_process; then
        log "$PROCESS_NAME 启动失败"
    else
        log "$PROCESS_NAME 启动成功"
    fi
else
    echo "$PROCESS_NAME 运行中"
fi

9. 快速连接脚本

服务器多了,记不住IP。

#!/bin/bash
#
# ssh shortcut: connect to a server by nickname, e.g. `s web1`.
#
# NOTE(review): the original user@host targets were destroyed by e-mail
# obfuscation during extraction ("[email protected]" — which the shell splits
# into two words, so ssh received a bogus host and command). The addresses
# below are placeholders; replace them with your own.

case "$1" in
    web1)
        ssh "root@192.168.1.10"
        ;;
    web2)
        ssh "root@192.168.1.11"
        ;;
    db)
        ssh "root@192.168.1.20"
        ;;
    redis)
        ssh "root@192.168.1.30"
        ;;

    home)
        ssh "user@home.example.com"
        ;;
    *)
        echo "用法: s [web1|web2|db|redis|home]"
        # Unknown or missing nickname is an error, not a success.
        exit 1
        ;;
esac

使用:

s web1   
s db     

10. 一键回滚脚本

部署出问题,快速回滚。

#!/bin/bash
#
# Interactive rollback: lists the newest entries in BACKUP_DIR, asks for a
# version, snapshots the current APP_DIR, replaces its contents with the
# chosen version and restarts the "app" service.

APP_DIR="/opt/app"
BACKUP_DIR="/opt/backup"

# Show the ten most recent entries for the operator to choose from.
echo "可用版本:"
ls -lt "$BACKUP_DIR" | head -10

read -r -p "请输入要回滚的版本: " VERSION

# Only a plain directory name is acceptable. Empty input would make
# "$BACKUP_DIR/$VERSION" resolve to BACKUP_DIR itself, and "." / ".." / paths
# with slashes could point outside it — all of which pass a bare -d test.
case "$VERSION" in
    "" | . | .. | */*)
        echo "版本不存在"
        exit 1
        ;;
esac

if [ -d "$BACKUP_DIR/$VERSION" ]; then
    echo "开始回滚到 $VERSION ..."

    # Snapshot the running version first; never delete anything unless the
    # snapshot succeeded. -a preserves ownership, modes and timestamps.
    CURRENT=$(date +%Y%m%d_%H%M%S)_rollback
    if ! cp -a "$APP_DIR" "$BACKUP_DIR/$CURRENT"; then
        echo "备份当前版本失败,已中止回滚" >&2
        exit 1
    fi

    # ${APP_DIR:?} aborts the script if the variable is ever empty, instead
    # of letting the command expand to `rm -rf /*`.
    rm -rf -- "${APP_DIR:?}"/*
    # "/." copies the directory's contents, including dotfiles.
    if ! cp -a "$BACKUP_DIR/$VERSION/." "$APP_DIR/"; then
        echo "恢复文件失败,当前版本已保存在 $BACKUP_DIR/$CURRENT" >&2
        exit 1
    fi

    systemctl restart app

    echo "回滚完成"
else
    echo "版本不存在"
    exit 1
fi

我的脚本管理方式

  1. 统一放/opt/scripts/ - 别到处乱放
  2. 加执行权限 - chmod +x *.sh
  3. 写注释 - 不然过几个月自己都看不懂
  4. Git管理 - 可以看变更历史
  5. 同步到所有服务器 - 用rsync或者ansible
rsync -avz /opt/scripts/ user@server1:/opt/scripts/
rsync -avz /opt/scripts/ user@server2:/opt/scripts/

如果服务器分布在不同网络,我一般用星空组网组到一起,然后用虚拟IP批量操作。


这些脚本都是实战中积累的,拿走直接用。有更好的欢迎评论区分享~