效果展示
===================== 2022/05/20-09:12:11+0000 =====================
===================== check system =====================
[INFO] [2022/05/20-09:12:11+0000] Hostname: test-master-01
[INFO] [2022/05/20-09:12:11+0000] Ipaddress: ## 和谐了,不给看 ##
[INFO] [2022/05/20-09:12:11+0000] Os-release: CentOS Linux 7 (Core) GNU/Linux
[INFO] [2022/05/20-09:12:11+0000] Kernel: Linux 3.10.0-1127.19.1.el7.x86_64 x86_64 GNU/Linux
[INFO] [2022/05/20-09:12:11+0000] Up Days: 143 days
[INFO] [2022/05/20-09:12:11+0000] Os Language: en_US.UTF-8
===================== check cpu =====================
[INFO] [2022/05/20-09:12:11+0000] CPU Model: AMD EPYC 7571
[INFO] [2022/05/20-09:12:11+0000] Physical CPUS: 1
[INFO] [2022/05/20-09:12:11+0000] Processor CPUS: 4
[INFO] [2022/05/20-09:12:11+0000] CPU Cores: 2
[INFO] [2022/05/20-09:12:11+0000] Load Average: 0.55 , 0.44 , 0.51
[INFO] [2022/05/20-09:12:11+0000] CPU Usage: 15.66%
===================== check memory =====================
[INFO] [2022/05/20-09:12:11+0000] Mem Total: 15.14GiB
[INFO] [2022/05/20-09:12:11+0000] Mem Used: 12.07GiB
[INFO] [2022/05/20-09:12:11+0000] Mem Available: 3.07GiB
[INFO] [2022/05/20-09:12:11+0000] Mem Usage: 79.73%
===================== check disk =====================
[INFO] [2022/05/20-09:12:11+0000] Disk Info:
[INFO] [2022/05/20-09:12:11+0000] /dev/nvme0n1p1 xfs 100G 55G 46G 55% /
[INFO] [2022/05/20-09:12:11+0000] Disk Inode Info:
[INFO] [2022/05/20-09:12:11+0000] /dev/nvme0n1p1 xfs 50M 722K 50M 2% /
===================== check kubernetes =====================
[INFO] [2022/05/20-09:12:11+0000] Apiserver Cert Not After: Mar 16 07:45:06 2023 GMT
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-master-01 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-master-02 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-master-03 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-01 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-02 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-03 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-04 is Ready
[INFO] [2022/05/20-09:12:11+0000] Node Status: test-node-05 is Ready
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-master-01 552m 13% 12590Mi 81%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-master-02 399m 9% 9644Mi 62%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-master-03 534m 13% 10336Mi 67%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-01 679m 16% 21175Mi 67%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-02 591m 14% 21119Mi 66%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-03 674m 16% 23677Mi 75%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-04 564m 14% 23123Mi 73%
[INFO] [2022/05/20-09:12:11+0000] Top Nodes: test-node-05 558m 13% 22760Mi 72%
目录结构
├── config
│ └── conf.sh
└── inspection.sh
config/conf.sh
#!/usr/bin/env bash
# Mount points to inspect (whitespace/newline separated).
## In some setups the data directory is a separately mounted disk; list it here so it is inspected too.
## The root filesystem (/) must also be inspected; do not remove it.
disk_lists='
/
/data
'
# Alert threshold for CPU utilization.
cpu_limit='85%'
# Alert threshold for memory utilization.
mem_limit='85%'
# Alert threshold for disk utilization.
## Per the original author's note, kubelet's default eviction condition is disk usage above 85%.
## If kubelet runs on this host, a value between 70% and 80% is recommended.
disk_limit='75%'
# Alert threshold for disk inode utilization.
disk_inode_limit='85%'
# Absolute path of the apiserver certificate.
## The kubeadm default is /etc/kubernetes/pki/apiserver.crt
api_cert_file='/etc/kubernetes/pki/apiserver.crt'
# Warn when the certificate has this many days (or fewer) left before it expires.
cert_expires='30'
# Path of the kubeconfig file passed to kubectl.
kube_config='/root/.kube/config'
inspection.sh
#!/usr/bin/env bash
# Global settings shared by the functions of this inspection script.

# Absolute directory that contains this script.
# Quoted $(...) instead of unquoted backticks so paths containing spaces work,
# and `&&` so a failed cd does not silently yield the caller's working directory.
base_dir=$(cd -- "$(dirname -- "$0")" && pwd)
# Path of the configuration file.
conf_file="${base_dir}/config/conf.sh"
# Directory where the logs are stored.
log_dir="${base_dir}/logs"
# Regular (INFO) log file of the current day.
log_file="${log_dir}/$(date +%Y-%m-%d)-INFO.log"
# Warning (WARN) log file of the current day.
warn_log="${log_dir}/$(date +%Y-%m-%d)-WARN.log"
# Timestamp used in log lines. It is evaluated once here, so every line
# written during one run carries the same timestamp.
time_style="$(date +%Y/%m/%d-%T%z)"
# df command line; adjust the options to the environment if needed.
# Kept as a plain string because it is expanded unquoted where it is used.
df_cmd="df -Th -x devtmpfs -x tmpfs -x debugfs -x aufs -x overlay -x fuse.glusterfs"
# Logs older than this many days get archived.
tar_time=7
# Name of the directory the old logs are collected in (date of ${tar_time} days ago).
tar_dir=$(date +%Y-%m-%d -d "${tar_time} days ago")
# Name of the archive built from that directory.
tar_name="${tar_dir}.tgz"
#######################################
# Load the configuration file and fall back to defaults for empty values.
# Globals:
#   conf_file (read)
#   disk_lists, cpu_limit, mem_limit, disk_limit, disk_inode_limit,
#   api_cert_file, cert_expires, kube_config, kube_cmd (written)
# Outputs:
#   An error message on stderr when the file is missing or cannot be sourced.
# Returns:
#   Exits the script with status 1 in that case, so callers (cron, CI) can
#   tell that no inspection took place.
#######################################
function check_config () {
  if [[ ! -f "${conf_file}" ]]; then
    echo "${conf_file} is not found, please check it !" >&2
    exit 1
  fi

  # Quoted so a script directory containing spaces can still be sourced.
  # shellcheck source=/dev/null
  if ! source "${conf_file}"; then
    echo "${conf_file} could not be loaded, please check it !" >&2
    exit 1
  fi

  # Mount points to inspect; defaults to the root filesystem.
  disk_lists=${disk_lists:-'/'}
  cpu_limit=${cpu_limit:-'85%'}
  mem_limit=${mem_limit:-'85%'}
  # Lower than the other thresholds: the original note says kubelet starts
  # evicting by default once disk usage exceeds 85%.
  disk_limit=${disk_limit:-'75%'}
  disk_inode_limit=${disk_inode_limit:-'85%'}
  api_cert_file=${api_cert_file:-'/etc/kubernetes/pki/apiserver.crt'}
  # Days before certificate expiry at which a warning is raised.
  cert_expires=${cert_expires:-'30'}
  kube_config=${kube_config:-'/root/.kube/config'}
  # Command line used for every kubectl call; expanded unquoted by its users.
  kube_cmd="kubectl --kubeconfig ${kube_config}"
}
#######################################
# Abort unless the script is executed by the user named root.
# Outputs:
#   A red error message on stderr when the current user is not root.
# Returns:
#   Exits the script with status 1 in that case (a refused run must not look
#   like a successful one to the caller).
#######################################
function check_user () {
  local current_user
  # Declared separately so `local` does not hide a failing `id`.
  current_user=$(id -u -n)
  if [[ "${current_user}" != "root" ]]; then
    printf '\e[1;31mPlease use the root to execute this shell !\e[0m\n' >&2
    exit 1
  fi
}
#######################################
# Print a blue "[INFO] [timestamp] message" progress line to stdout.
# Globals:
#   time_style (read)
# Arguments:
#   The message; all arguments are joined with spaces.
#######################################
function print_terminal () {
  # The message is passed as data, not as part of the format string, so a
  # '%' or backslash inside it is printed literally.
  printf '\e[1;34m[INFO] [%s] %s\e[0m\n' "${time_style}" "$*"
}
#######################################
# Append a section title to the INFO log.
# Globals:
#   log_file (read)
# Arguments:
#   The title text; all arguments are joined with spaces.
#######################################
function print_info_title () {
  # When the log already exists, a line holding a single space separates the
  # new section from the previous one.
  if [[ -f "${log_file}" ]]; then
    printf ' \n' >> "${log_file}"
  fi
  # The redirect target is quoted: an unquoted path containing spaces makes
  # bash fail with "ambiguous redirect".
  printf '===================== %s =====================\n' "$*" >> "${log_file}"
}
#######################################
# Append a section title to the WARN log.
# Globals:
#   warn_log (read)
# Arguments:
#   The title text; all arguments are joined with spaces.
#######################################
function print_warn_title () {
  # When the log already exists, a line holding a single space separates the
  # new section from the previous one.
  if [[ -f "${warn_log}" ]]; then
    printf ' \n' >> "${warn_log}"
  fi
  # The redirect target is quoted: an unquoted path containing spaces makes
  # bash fail with "ambiguous redirect".
  printf '===================== %s =====================\n' "$*" >> "${warn_log}"
}
#######################################
# Write a section title to the WARN log unless that title is already there.
# Globals:
#   warn_log (read)
# Arguments:
#   The title text; all arguments are joined with spaces.
#######################################
function check_warn_title () {
  # Look for the complete title line as written by print_warn_title, as a
  # fixed string (-F) matching the whole line (-x). A plain `grep "$*"` treats
  # the title as a regex and is satisfied by any line that merely contains the
  # words (e.g. a process listing), which would suppress the title.
  local title_line="===================== ${*} ====================="
  grep -qxF -- "${title_line}" "${warn_log}" 2> /dev/null \
    || print_warn_title "${*}"
}
#######################################
# Append one "[INFO] [timestamp] message" line to the INFO log.
# Globals:
#   time_style, log_file (read)
# Arguments:
#   The message; all arguments are joined with spaces.
#######################################
function print_info () {
  # Quoted redirect target: an unquoted path with spaces is an
  # "ambiguous redirect" in bash.
  printf '[INFO] [%s] %s\n' "${time_style}" "$*" >> "${log_file}"
}
#######################################
# Append one "[WARN] [timestamp] message" line to the WARN log.
# Globals:
#   time_style, warn_log (read)
# Arguments:
#   The message; all arguments are joined with spaces.
#######################################
function print_warn () {
  # Quoted redirect target: an unquoted path with spaces is an
  # "ambiguous redirect" in bash.
  printf '[WARN] [%s] %s\n' "${time_style}" "$*" >> "${warn_log}"
}
#######################################
# Prepare the log directory for a new run: create it if needed, move aside
# log files already written today, then start both logs with a timestamp title.
# Globals:
#   log_dir, log_file, warn_log, time_style (read)
# Outputs:
#   An error message on stderr when the log directory cannot be created.
# Returns:
#   Exits the script with status 1 in that case, because nothing could be
#   logged afterwards.
#######################################
function check_log_dir () {
  local rotate_suffix

  if ! mkdir -p -- "${log_dir}"; then
    echo "cannot create log directory ${log_dir}" >&2
    exit 1
  fi

  # Logs from an earlier run on the same day are kept under
  # "<name>-<HH:MM:SS+zone>" instead of being appended to.
  rotate_suffix="-$(date +%T%z)"
  if [[ -f "${log_file}" ]]; then
    mv -- "${log_file}" "${log_file}${rotate_suffix}"
  fi
  if [[ -f "${warn_log}" ]]; then
    mv -- "${warn_log}" "${warn_log}${rotate_suffix}"
  fi

  print_info_title "${time_style}"
  print_warn_title "${time_style}"
}
#######################################
# Archive log files older than ${tar_time} days and delete archives that are
# themselves older than ${tar_time} days.
# Globals:
#   log_dir, tar_time, tar_dir, tar_name (read)
# Outputs:
#   A progress line on the terminal via print_terminal.
#######################################
function check_tar () {
  local -a stale_logs=()
  local stale_log
  local archive="${log_dir}/${tar_name}"

  # Collect the old logs NUL-delimited. The -name pattern is quoted so the
  # shell cannot expand it against files in the current directory.
  while IFS= read -r -d '' stale_log; do
    stale_logs+=("${stale_log}")
  done < <(find "${log_dir}" -type f -mtime "+${tar_time}" -name '*.log*' -print0)

  # Numeric test; `[[ n > 0 ]]` compares strings.
  if (( ${#stale_logs[@]} > 0 )); then
    mkdir -p -- "${log_dir}/${tar_dir}"
    # Keep an archive of the same name from an earlier run. The archive is
    # created in ${log_dir}, so that is where an existing one has to be
    # looked for.
    if [[ -f "${archive}" ]]; then
      mv -- "${archive}" "${archive}-$(date +%T%z)"
    fi
    mv -- "${stale_logs[@]}" "${log_dir}/${tar_dir}/" &> /dev/null
    # -C instead of cd: the working directory of the running script stays
    # untouched. The staging directory is removed only after tar succeeded.
    if tar -czf "${archive}" -C "${log_dir}" "${tar_dir}"; then
      rm -rf -- "${log_dir:?}/${tar_dir:?}"
    fi
  fi

  # Drop old archives. Archives are named *.tgz (optionally with a time
  # suffix when moved aside); *.tar.gz is still matched for archives that
  # were named that way.
  find "${log_dir}" -type f -mtime "+${tar_time}" \
    \( -name '*.tgz*' -o -name '*.tar.gz' \) -exec rm -f -- {} +

  print_terminal "check logs done"
}
#######################################
# Log general host information and make sure swap and firewalld are off.
# NOTE: this is not a read-only check. Active swap is turned off with
# `swapoff -a`; a running firewalld is stopped and an enabled one disabled.
# Outputs:
#   Lines in the INFO log, lines in the WARN log for everything that had to
#   be switched off, and a progress line on the terminal.
#######################################
function check_system () {
  local host_name host_ip os_release kernel_info
  local uptime_secs uptime_days swap_count
  local firewalld_status firewalld_enable

  print_info_title 'check system'

  # Host name (falls back to the hostname command when /etc/hostname is absent).
  if [[ -r /etc/hostname ]]; then
    host_name=$(< /etc/hostname)
  else
    host_name=$(hostname)
  fi
  print_info "Hostname: ${host_name}"

  # IP address as reported by `hostname -i`.
  # NOTE(review): the original comment said the address is taken from the
  # hosts file because `hostname -i` is unusable on dual-NIC hosts, but the
  # code calls `hostname -i` - confirm which one is intended.
  host_ip=$(hostname -i)
  print_info "Ipaddress: ${host_ip}"

  # Distribution.
  os_release=$(awk -F '"' '/PRETTY_NAME/ {print $2}' /etc/os-release)
  print_info "Os-release: ${os_release} $(uname -o)"

  # Kernel.
  kernel_info=$(uname -srmo)
  print_info "Kernel: ${kernel_info}"

  # Uptime in whole days.
  uptime_secs=$(awk -F '.' '{print $1}' /proc/uptime)
  uptime_days=$(( uptime_secs / 60 / 60 / 24 ))
  print_info "Up Days: ${uptime_days} days"

  # Locale.
  print_info "Os Language: ${LANG:-}"

  # Swap: every line after the header of /proc/swaps is an active swap area.
  swap_count=$(awk 'NR > 1' /proc/swaps 2> /dev/null | wc -l)
  if (( swap_count == 0 )); then
    print_info "Swap Status: off"
  else
    check_warn_title 'check system'
    swapoff -a
    # Goes to the WARN log like the firewalld actions below; otherwise the
    # warn title written above would have no entry under it.
    print_warn "Swap Status: manual off"
  fi

  # firewalld must not be running. Only states that mean "running" trigger a
  # stop, so hosts without firewalld (is-active prints "unknown"/"inactive")
  # are not reported as manually stopped.
  firewalld_status=$(systemctl is-active firewalld 2> /dev/null)
  case "${firewalld_status}" in
    active | activating | reloading)
      check_warn_title 'check system'
      systemctl stop firewalld
      print_warn "Firewalld Status: manual dead"
      ;;
    *)
      print_info "Firewalld Status: dead"
      ;;
  esac

  # firewalld must not start at boot. Only "enabled*" states trigger a
  # disable; a missing unit prints nothing and needs no action.
  firewalld_enable=$(systemctl is-enabled firewalld 2> /dev/null)
  case "${firewalld_enable}" in
    enabled*)
      check_warn_title 'check system'
      systemctl disable firewalld
      print_warn "Firewalld Enabled: manual disabled"
      ;;
    *)
      print_info "Firewalld Enabled: disabled"
      ;;
  esac

  print_terminal "check system done"
}
#######################################
# Log CPU model/topology, load averages and CPU utilization; warn when the
# utilization reaches ${cpu_limit}.
# Globals:
#   cpu_limit, warn_log (read)
# Outputs:
#   Lines in the INFO log, optionally a warning plus the most CPU-hungry
#   processes in the WARN log, and a progress line on the terminal.
#######################################
function check_cpu () {
  local physical_cpus process_cpus core_cpus cpu_model
  local one_min five_min fif_min cpu_util top_cpu_use

  print_info_title "check cpu"

  # CPU information from /proc/cpuinfo.
  physical_cpus=$(grep '^physical id' /proc/cpuinfo | sort -u | wc -l)
  process_cpus=$(grep -c '^processor' /proc/cpuinfo)
  core_cpus=$(awk '/^cpu cores/ {cores = $NF} END {print cores}' /proc/cpuinfo)
  cpu_model=$(awk -F ': ' '/^model name/ {print $2}' /proc/cpuinfo | sort -u)
  print_info "CPU Model: ${cpu_model}"
  print_info "Physical CPUS: ${physical_cpus}"
  print_info "Processor CPUS: ${process_cpus}"
  print_info "CPU Cores: ${core_cpus}"

  # 1/5/15 minute load averages are the first three fields of /proc/loadavg.
  read -r one_min five_min fif_min _ < /proc/loadavg
  print_info "Load Average: ${one_min} , ${five_min} , ${fif_min}"

  # (user + system) / (user + system + idle), taken from the aggregate "cpu"
  # line of /proc/stat, i.e. from cumulative counters rather than a sampled
  # interval. "%%" is the portable way to print a literal percent sign; a
  # lone trailing "%" is not accepted by every awk implementation.
  cpu_util=$(awk '/^cpu / {printf "%.2f%%", ($2 + $4) * 100 / ($2 + $4 + $5)}' /proc/stat)
  print_info "CPU Utilization: ${cpu_util}"

  # Compare the integer part of the utilization with the limit ('%' stripped).
  if [[ "${cpu_util%%.*}" -ge "${cpu_limit%\%}" ]]; then
    # Header line plus ten processes.
    top_cpu_use=$(ps -eo user,pid,pcpu,args --sort=-pcpu | head -n 11)
    check_warn_title 'check cpu'
    print_warn "CPU utilization is ${cpu_util} , it's greater equal ${cpu_limit}, should be check !"
    print_warn "Top 10 CPU Use: "
    printf '%s\n' "${top_cpu_use}" >> "${warn_log}"
  fi

  print_terminal "check cpu done"
}
#######################################
# Log total/used/available memory and the memory utilization; warn when the
# utilization reaches ${mem_limit}.
# Globals:
#   mem_limit, warn_log (read)
# Outputs:
#   Lines in the INFO log, optionally a warning plus the most memory-hungry
#   processes in the WARN log, and a progress line on the terminal.
#######################################
function check_mem () {
  local mem_total mem_used mem_available mem_util top_mem_use

  print_info_title "check memory"

  # One pass over /proc/meminfo yields all four values (kB -> GiB).
  # "used" is MemTotal - MemAvailable; nothing is printed when the kernel
  # does not report MemAvailable. "%%" prints a literal percent sign
  # portably across awk implementations.
  read -r mem_total mem_used mem_available mem_util < <(
    awk '
      /^MemTotal:/ { total = $2 / 1024 / 1024; next }
      /^MemAvailable:/ && total > 0 {
        available = $2 / 1024 / 1024
        used = total - available
        printf "%.2fGiB %.2fGiB %.2fGiB %.2f%%\n", total, used, available, used / total * 100
      }' /proc/meminfo
  )

  print_info "Mem Total: ${mem_total}"
  print_info "Mem Used: ${mem_used}"
  print_info "Mem Available: ${mem_available}"
  print_info "Mem Utilization: ${mem_util}"

  # Compare the integer part of the utilization with the limit ('%' stripped).
  if [[ "${mem_util%%.*}" -ge "${mem_limit%\%}" ]]; then
    # The process list is only needed here; header line plus ten processes.
    top_mem_use=$(ps -eo user,pid,pmem,args --sort=-pmem | head -n 11)
    check_warn_title 'check memory'
    print_warn "Mem utilization is ${mem_util}, it's greater equal ${mem_limit}, should be check !"
    print_warn "Top 10 Mem Use: "
    printf '%s\n' "${top_mem_use}" >> "${warn_log}"
  fi

  print_terminal "check memory done"
}
#######################################
# Helper for check_disk: log the df line of every configured mount point and
# warn when its usage column reaches the given limit.
# Globals:
#   disk_lists (read)
# Arguments:
#   $1 - complete output of one df call
#   $2 - limit, e.g. 75%
#   $3 - wording used in the warning ("utilization" / "inode utilization")
#######################################
function _report_df_usage () {
  local df_report=$1 limit=$2 label=$3
  local -a mount_points=()
  local mount_point df_line usage

  # Split the configured list on whitespace/newlines without globbing.
  # read returns non-zero at end of input; the array is filled regardless.
  read -r -d '' -a mount_points <<< "${disk_lists}"

  for mount_point in "${mount_points[@]}"; do
    # Exact comparison with the last column (the mount point). A regex such
    # as "/data$" would also select /mnt/data.
    df_line=$(awk -v mp="${mount_point}" '$NF == mp {print; exit}' <<< "${df_report}")
    # A configured path that is not mounted is skipped; the remaining
    # entries must still be checked.
    [[ -n "${df_line}" ]] || continue
    print_info "${df_line}"

    # With `df -T` the usage percentage is column 6; adjust this when the
    # df options are changed.
    usage=$(awk '{print $6}' <<< "${df_line}")
    # Some filesystems report "-" instead of a percentage.
    [[ "${usage}" =~ ^[0-9]+%$ ]] || continue
    if [[ "${usage%\%}" -ge "${limit%\%}" ]]; then
      check_warn_title 'check disk'
      print_warn "Disk ${mount_point} ${label} is ${usage}, it's greater equal ${limit}, should be check !"
    fi
  done
}

#######################################
# Log space and inode usage of the configured mount points and warn when
# ${disk_limit} / ${disk_inode_limit} is reached.
# Globals:
#   df_cmd, disk_lists, disk_limit, disk_inode_limit (read)
# Outputs:
#   Lines in the INFO log, warnings in the WARN log and a progress line on
#   the terminal.
#######################################
function check_disk () {
  print_info_title "check disk"

  # ${df_cmd} is expanded unquoted on purpose: it is a command line stored
  # in a string and has to be split into words. df runs once per report.
  print_info "Disk Info: "
  _report_df_usage "$(${df_cmd})" "${disk_limit}" 'utilization'

  print_info '---'
  print_info "Disk Inode Info: "
  _report_df_usage "$(${df_cmd} -i)" "${disk_inode_limit}" 'inode utilization'

  print_terminal "check disk done"
}
#######################################
# Check the apiserver certificate expiry and, when a kubeconfig is present,
# the readiness and resource usage of every cluster node.
# Globals:
#   api_cert_file, cert_expires, kube_config, kube_cmd, cpu_limit,
#   mem_limit (read)
# Outputs:
#   Lines in the INFO log, warnings in the WARN log and a progress line on
#   the terminal.
#######################################
function check_kubernetes () {
  local cert_end cert_end_ts cert_days_left
  local node_name node_status
  local top_report top_line node_cpu_usage node_mem_usage

  print_info_title "check kubernetes"

  if [[ -f "${api_cert_file}" ]]; then
    # -enddate prints "notAfter=<date>".
    cert_end=$(openssl x509 -in "${api_cert_file}" -noout -enddate)
    cert_end=${cert_end#notAfter=}
    print_info "Apiserver Cert Not After: ${cert_end}"
    # An empty date must not reach `date -d`, which would read it as today.
    if [[ -n "${cert_end}" ]] && cert_end_ts=$(date -d "${cert_end}" +%s 2> /dev/null); then
      cert_days_left=$(( (cert_end_ts - $(date +%s)) / 86400 ))
      if [[ "${cert_days_left}" -le "${cert_expires}" ]]; then
        check_warn_title 'check kubernetes'
        # Report the days actually left, not the configured threshold.
        print_warn "The apiserver cert will expire in ${cert_days_left} days, please renewal !"
      fi
    else
      check_warn_title 'check kubernetes'
      print_warn "Unable to read the expiry date of ${api_cert_file}, please check !"
    fi
  fi

  if [[ ! -f "${kube_config}" ]]; then
    print_info "This node's role is the work for kubernetes cluster"
    print_terminal "check kubernetes done"
    return 0
  fi

  # One kubectl call; name and status are read per line, so a node name that
  # is a prefix of another (node-1 / node-10) cannot pick up the wrong row.
  # ${kube_cmd} is expanded unquoted on purpose: it is a command line stored
  # in a string.
  while read -r node_name node_status _; do
    [[ -n "${node_name}" ]] || continue
    if [[ "${node_status}" == "Ready" ]]; then
      print_info "Node Status: ${node_name} is Ready"
    else
      check_warn_title 'check kubernetes'
      print_warn "Node: ${node_name} is NotReady , please check !"
    fi
  done < <(${kube_cmd} get nodes --no-headers=true)

  # Resource usage per node; skipped silently when `top node` fails
  # (for example because no metrics are available).
  if top_report=$(${kube_cmd} top node 2> /dev/null); then
    while IFS= read -r top_line; do
      # Columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
      read -r node_name _ node_cpu_usage _ node_mem_usage _ <<< "${top_line}"
      [[ -n "${node_name}" && "${node_name}" != "NAME" ]] || continue
      print_info "Top Nodes: ${top_line}"
      # Nodes without metrics show "<unknown>"; only compare real percentages.
      if [[ "${node_cpu_usage}" =~ ^[0-9]+%$ ]] \
        && [[ "${node_cpu_usage%\%}" -ge "${cpu_limit%\%}" ]]; then
        check_warn_title 'check kubernetes'
        print_warn "${node_name} top node check: cpu usage is ${node_cpu_usage}, it's greater equal ${cpu_limit}, should be check !"
      fi
      if [[ "${node_mem_usage}" =~ ^[0-9]+%$ ]] \
        && [[ "${node_mem_usage%\%}" -ge "${mem_limit%\%}" ]]; then
        check_warn_title 'check kubernetes'
        print_warn "${node_name} top node check: mem usage is ${node_mem_usage}, it's greater equal ${mem_limit}, should be check !"
      fi
    done <<< "${top_report}"
  fi

  print_terminal "check kubernetes done"
}
# Entry point: run every inspection step once, in this fixed order
# (configuration and user checks first, then log housekeeping, then the
# individual checks).
function main () {
  local step
  for step in \
    check_config \
    check_user \
    check_log_dir \
    check_tar \
    check_system \
    check_cpu \
    check_mem \
    check_disk \
    check_kubernetes; do
    "${step}"
  done
}
main