公司的服务器越来越多,原来只用一个脚本做检测,现在改用 Nagios 进行监控。
下面是 Ubuntu 客户端(NRPE)的安装脚本:
#!/bin/bash
# Builds and installs the Nagios plugins (2.0.1) and the NRPE agent (2.15)
# from source on an Ubuntu client, writes the NRPE configuration, registers
# the daemon in /etc/rc.local and starts it.
#
# Needs root privileges: it runs groupadd/useradd and apt-get and writes to
# /etc/rc.local and /usr/local/nagios.
#
# NOTE(review): the generated nrpe.cfg references check_alldisk, check_mdadm,
# check_smart and check_drbd. Nothing in this script installs those files, so
# they presumably have to be copied into /usr/local/nagios/libexec by hand
# (see the commented-out mv/chmod lines below).

# Stop at the first failing step instead of configuring a half-built install.
set -e

tmp_dir=/tmp/nagios
nagios_ser="192.168.1.3"   # Nagios server added to allowed_hosts in nrpe.cfg
nrpe_cfg=/usr/local/nagios/etc/nrpe.cfg
nrpe_start="/usr/local/nagios/bin/nrpe -c $nrpe_cfg -d"

# Create the service account only when it is missing so the script can be
# re-run on a host that already has it.
getent group nagios >/dev/null || groupadd nagios
id -u nagios >/dev/null 2>&1 || useradd -g nagios -s /sbin/nologin nagios

mkdir -p "$tmp_dir"
cd "$tmp_dir"

wget http://downloads.sourceforge.net/project/nagios/nrpe-2.x/nrpe-2.15/nrpe-2.15.tar.gz
wget http://nagios-plugins.org/download/nagios-plugins-2.0.1.tar.gz

#---- install
apt-get -y --force-yes install openssl ruby1.9.1 build-essential
apt-get -y --force-yes install libssl-dev lm-sensors

# Nagios plugins
tar xvf nagios-plugins-2.0.1.tar.gz
cd nagios-plugins-2.0.1
./configure --with-nagios-user=nagios --with-nagios-group=nagios
make
make install

# NRPE daemon and check_nrpe plugin
cd "$tmp_dir"
tar xvf nrpe-2.15.tar.gz
cd ./nrpe-2.15
./configure --with-ssl-lib=/usr/lib/x86_64-linux-gnu
make all
make install-plugin
make install-daemon
make install-daemon-config

#mv ./check_* /usr/local/nagios/libexec
#chmod 755 -R /usr/local/nagios/libexec
chown -R nagios:nagios /usr/local/nagios/

# The heredoc delimiter is unquoted on purpose: $nagios_ser must be expanded.
cat > "$nrpe_cfg" <<EOF
log_facility=daemon
pid_file=/var/run/nrpe.pid
server_port=5666
nrpe_user=nagios
nrpe_group=nagios
allowed_hosts=127.0.0.1,$nagios_ser
dont_blame_nrpe=0
allow_bash_command_substitution=0
debug=0
command_timeout=60
connection_timeout=300
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_alldisk]=/usr/local/nagios/libexec/check_alldisk -w 90 -c 95
command[check_http]=/usr/local/nagios/libexec/check_http -H 127.0.0.1 -w 5 -c 10
command[check_ping]=/usr/local/nagios/libexec/check_ping -H 127.0.0.1 -w 3000.0,80% -c 5000.0,100% -p 5
command[check_ssh]=/usr/local/nagios/libexec/check_ssh -4 127.0.0.1
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 30% -c 10%
command[check_sensors]=/usr/local/nagios/libexec/check_sensors
command[check_mdadm]=/usr/local/nagios/libexec/check_mdadm
command[check_smart]=/usr/local/nagios/libexec/check_smart
command[check_drbd]=/usr/local/nagios/libexec/check_drbd
EOF

# Start NRPE at boot. A stock Ubuntu rc.local ends with "exit 0", so a line
# appended after it would never run: insert the command before "exit 0" when
# that line exists, and skip the step if the command is already registered.
if ! grep -qF "$nrpe_start" /etc/rc.local 2>/dev/null; then
    if grep -q '^exit 0' /etc/rc.local 2>/dev/null; then
        sed -i "/^exit 0/i $nrpe_start" /etc/rc.local
    else
        echo "$nrpe_start" >> /etc/rc.local
    fi
fi

# Start the daemon now.
/usr/local/nagios/bin/nrpe -c "$nrpe_cfg" -d

# Leave the build directory before deleting it.
cd /
rm -rf "$tmp_dir"
以下是自己折腾的几个 Ruby 检测脚本。
1:check_smart —— 磁盘 SMART 健康状态检测
#!/usr/bin/env ruby
# Nagios/NRPE plugin: runs `smartctl -H` on every whole disk matching
# /dev/sd[a-z]* and reports one "<device> OK" or "<device> Fail" line each.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# Setup (smartctl is executed through sudo):
#   echo "nagios ALL=NOPASSWD:/usr/sbin/smartctl" >>/etc/sudoers
#   CentOS: sed -i "s:Defaults requiretty:Defaults:nagios !requiretty:" /etc/sudoers
# Nagios invocation: check_nrpe!check_smart

# Keep only names ending in a letter (sda, sdb, ...); names ending in a digit
# are partitions (sda1, ...). Sorted so the report order is stable.
disks = Dir.glob('/dev/sd[a-z]*').grep(/[a-z]\z/).sort

# An empty disk list used to print nothing and exit 0, which Nagios shows as
# a healthy service with no output. Report it as UNKNOWN instead.
if disks.empty?
  puts "UNKNOWN: no /dev/sd* disks found"
  exit 3
end

failed = false
report = disks.map do |hdd|
  # Extract the text after the colon on smartctl's "... result: ..." line.
  status = `sudo /usr/sbin/smartctl -H #{hdd} | grep result | awk -F: '{print $2}'`
  if status =~ /PASSED/
    "#{hdd} OK"
  else
    # Anything but PASSED counts as a failure, including empty output
    # (for example when sudo or smartctl could not run).
    failed = true
    "#{hdd} Fail"
  end
end

puts report.join("\n")
exit(failed ? 2 : 0)
2:check_mdadm —— 软阵列(mdadm 软 RAID)检测
#!/usr/bin/env ruby
# Nagios/NRPE plugin: checks Linux software RAID (md) health from /proc/mdstat.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# The previous check compared the number of "U" characters with twice the
# number of "md" occurrences, which only works when every array has exactly
# two members: a degraded three-disk array ("[UU_]") was reported as OK and a
# healthy one ("[UUU]") as failed. This version inspects the per-array member
# map instead.

MDSTAT_PATH = '/proc/mdstat'

begin
  mdstat = File.read(MDSTAT_PATH)
rescue SystemCallError => e
  # Without the file nothing can be said about the arrays.
  puts "Soft Raid UNKNOWN: cannot read #{MDSTAT_PATH} (#{e.message})"
  exit 3
end

# First word after "mdN :" on each array line, e.g. "active" or "inactive".
array_states = mdstat.scan(/^md\S*\s*:\s*(\S+)/).flatten
# Member maps such as "UU" or "U_"; an underscore marks a missing member.
member_maps = mdstat.scan(/\[([U_]+)\]/).flatten

inactive = array_states.any? { |state| state != 'active' }
degraded = member_maps.any? { |map| map.include?('_') }

if inactive || degraded
  puts "Soft Raid Fail"
  exit 2
else
  # Also reached when no arrays are defined, as before.
  puts "Soft Raid OK"
  exit 0
end
3:check_drbd —— DRBD 状态检测
#!/usr/bin/ruby
# Nagios/NRPE plugin for DRBD.
#
# Counts the occurrences of "UpToDate" in /proc/drbd and compares that with
# twice the number of block devices under /dev whose listing contains "drbd".
# NOTE(review): the factor of two presumably stands for the local and peer
# disk state of each device - confirm against the DRBD version in use.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown

up_to_date_count = `cat /proc/drbd`.scan("UpToDate").count
drbd_device_count = `ls -la /dev/ | grep ^b | grep drbd | wc -l`.to_i

# Any mismatch between the two counts is reported as critical.
unless up_to_date_count == drbd_device_count * 2
  puts "DRBD Critical"
  exit 2
end

puts "DRBD OK"
exit 0
4:check_alldisk —— 磁盘空间使用率检测
#!/usr/bin/env ruby
# Nagios/NRPE plugin: checks the usage percentage of every local filesystem
# listed by df (tmpfs and devtmpfs excluded) against two thresholds and
# prints one "<filesystem> <use%> <mount point> <state>" line per filesystem.
#
# Usage: check_alldisk -w 90 -c 95
#   -w  warning threshold in percent  (was read from ARGV[1])
#   -c  critical threshold in percent (was read from ARGV[3])
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# Fixes: the final status test looked for the misspelled word "Crtitical",
# so a filesystem above the critical threshold never produced exit code 2.
# The exit code is now tracked directly instead of searching the report text.

DEFAULT_WARNING = 90
DEFAULT_CRITICAL = 95
FIELDS_PER_ROW = 6 # Filesystem, Size, Used, Avail, Use%, Mounted on

# Reads a threshold: the value following +flag+ if the flag is present,
# otherwise the argument at +position+ (the original positional contract),
# otherwise +default+ when the value is missing or not a plain integer.
def threshold(flag, position, default)
  flag_index = ARGV.index(flag)
  raw = flag_index ? ARGV[flag_index + 1] : ARGV[position]
  raw.to_s =~ /\A\d+\z/ ? raw.to_i : default
end

warning_at = threshold('-w', 1, DEFAULT_WARNING)
critical_at = threshold('-c', 3, DEFAULT_CRITICAL)

# -P selects the POSIX output format, which keeps each filesystem on one line.
fields = `df -hlP -x tmpfs -x devtmpfs | grep -v ^Filesystem`.split
if fields.size < FIELDS_PER_ROW
  # df produced no complete row, so nothing can be evaluated.
  puts "UNKNOWN"
  exit 3
end

# Ignore a trailing incomplete row, as the original index arithmetic did.
rows = fields.each_slice(FIELDS_PER_ROW).select { |row| row.size == FIELDS_PER_ROW }

worst = 0
report = rows.map do |row|
  filesystem, use, mount_point = row[0], row[4], row[5]
  percent = use.to_i # "42%".to_i => 42

  state, code =
    if percent > critical_at
      ['Critical', 2]
    elsif percent > warning_at
      ['Warning', 1]
    else
      ['OK', 0]
    end

  worst = code if code > worst
  "#{filesystem} #{use} #{mount_point} #{state}"
end

puts report.join("\n")
exit worst
本文转自 nonono11 的 51CTO 博客,原文链接:http://blog.51cto.com/abian/1412478 。如需转载,请自行联系原作者。