检查命令的配置文件位于 /etc/nagios-plugins/config/ 目录下
20.6.1. check_ping
nagios check_ping命令使用方法
具体如下: -H 主机地址 -w WARNING 状态: 响应时间(毫秒),丢包率 (%) 阈值 -c CRITICAL 状态: 响应时间(毫秒),丢包率 (%) 阈值 -p 发送的包数 默认5个包 -t 超时时间 默认10秒 -4|-6 使用ipv4|ipv6 地址 默认ipv4
实例:
/usr/lib64/nagios/plugins/check_ping -H 74.125.71.106 -w 100.0,20% -c 200.0,50%
20.6.2. check_procs
# /usr/lib64/nagios/plugins/check_procs PROCS OK: 75 processes # /usr/lib64/nagios/plugins/check_procs -a mingetty PROCS OK: 6 processes with args 'mingetty' # /usr/lib64/nagios/plugins/check_procs -C crond PROCS OK: 1 process with command name 'crond'
20.6.3. check_users
只要有用户登录就发出警告
# /usr/lib64/nagios/plugins/check_users -w 0 -c 5 USERS WARNING - 1 users currently logged in |users=1;0;5;0
监控登录用户数上限(超过 5 个用户时发出警告)
# /usr/lib64/nagios/plugins/check_users -w 5 -c 50 USERS OK - 1 users currently logged in |users=1;5;50;0
20.6.4. check_http
命令定义
define command{ command_name check_http_404 command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -e '404' } define command{ command_name check_http_status command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -e '$ARG1$' } define command{ command_name check_http_url command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -u '$ARG1$' }
默认HTTP健康检查超时时间是10秒,如果你的网站需要更长的时间才能打开可以使用-t参数修改默认Timeout时间
# 'check_http' command definition define command{ command_name check_http command_line /usr/lib/nagios/plugins/check_http -t 30 -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' }
# /srv/nagios/libexec/check_http -H www.163.com HTTP OK: HTTP/1.0 200 OK - 657627 bytes in 1.772 second response time |time=1.771681s;;;0.000000 size=657627B;;;0 $ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -s "HTTs" HTTP CRITICAL: HTTP/1.1 404 Not Found - string not found - 336 bytes in 0.001 second response time |time=0.000733s;;;0.000000 size=336B;;;0 $ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -e '404' HTTP OK: Status line output matched "404" - 336 bytes in 0.001 second response time |time=0.000715s;;;0.000000 size=336B;;;0
20.6.5. check_mysql
命令参数
check_mysql [-d database] [-H host] [-P port] [-s socket] [-u user] [-p password] [-S] /usr/lib64/nagios/plugins/check_mysql -d dbname -H 202.176.120.10 -P 3306 -u test -p password Uptime: 254264 Threads: 16 Questions: 535110791 Slow queries: 21 Opens: 110 Flush tables: 1 Open tables: 81 Queries per second avg: 2104.547
20.6.5.1. check_mysql
$ /usr/lib64/nagios/plugins/check_mysql --hostname=172.16.1.5 --port=3306 --username=monitor --password=monitor Uptime: 27001 Threads: 8 Questions: 25280156 Slow queries: 14941 Opens: 1389932 Flush tables: 3 Open tables: 128 Queries per second avg: 936.267
20.6.5.2. mysql.cfg check_mysql_replication
cat >> /usr/lib64/nagios/plugins/check_mysql_replication <<EOF #!/bin/bash declare -a slave_is slave_is=($(mysql -h$1 -umonitor -pxmNhj -e "show slave status\G"|grep Running |awk '{print $2}')) if [ "${slave_is[0]}" = "Yes" -a "${slave_is[1]}" = "Yes" ] then echo "OK - Slave is running" exit 0 else echo "Critical - Slave is error" exit 2 fi EOF
sudo chmod +x /usr/lib64/nagios/plugins/check_mysql_replication /usr/lib64/nagios/plugins/check_mysql_replication 172.16.1.4 Critical - slave is error
vim /etc/nagios-plugins/config/mysql.cfg # 'check_mysql_replication' command definition define command{ command_name check_mysql_replication command_line /usr/lib/nagios/plugins/check_mysql_replication $HOSTADDRESS$ } define command{ command_name check_mysql_replication_host command_line /usr/lib/nagios/plugins/check_mysql_replication '$ARG1$' }
20.6.5.3. nrpe.cfg check_mysql_replication
nrpe.cfg
cat >> /usr/lib64/nagios/plugins/check_mysql_replication <<EOF #!/bin/bash declare -a slave_is slave_is=($(mysql -umonitor -pxmNhj -e "show slave status\G"|grep Running |awk '{print $2}')) if [ "${slave_is[0]}" = "Yes" -a "${slave_is[1]}" = "Yes" ] then echo "OK - slave is running" exit 0 else echo "Critical - slave is error" exit 2 fi EOF command[check_mysql_slave]=/usr/lib64/nagios/plugins/check_mysql_replication /usr/local/nagios/libexec/check_nrpe -H 192.168.1.1 /usr/local/nagios/libexec/check_nrpe -H 192.168.1.1 -c check_mysql_replication define service { host_name 192.168.10.232 service_description check_mysql_replication check_period 24x7 max_check_attempts 5 normal_check_interval 3 retry_check_interval 2 contact_groups mygroup notification_interval 5 notification_period 24x7 notification_options w,u,c,r check_command check_nrpe!check_mysql_replication }
20.6.6. Disk
20.6.6.1. disk.cfg
$ cat /etc/nagios-plugins/config/disk.cfg # 'check_disk' command definition define command{ command_name check_disk command_line /usr/lib/nagios/plugins/check_disk -w '$ARG1$' -c '$ARG2$' -e -p '$ARG3$' } # 'check_all_disks' command definition define command{ command_name check_all_disks command_line /usr/lib/nagios/plugins/check_disk -w '$ARG1$' -c '$ARG2$' -e } # 'ssh_disk' command definition define command{ command_name ssh_disk command_line /usr/lib/nagios/plugins/check_by_ssh -H '$HOSTADDRESS$' -C '/usr/lib/nagios/plugins/check_disk -w '\''$ARG1$' -c '\''$ARG2$'\'' -e -p '\''$ARG3$'\' } #### # use these checks, if you want to test IPv4 connectivity on IPv6 enabled systems #### # 'ssh_disk_4' command definition define command{ command_name ssh_disk_4 command_line /usr/lib/nagios/plugins/check_by_ssh -H '$HOSTADDRESS$' -C '/usr/lib/nagios/plugins/check_disk -w '\''$ARG1$'\'' -c '\''$ARG2$'\'' -e -p '\''$ARG3$'\' -4 }
20.6.6.2. check_disk
WARNING/CRITICAL 报警阈值
-w 10% -c 5% -w 100M -c 50M
-p, --path=PATH, --partition=PARTITION 参数指定要监控的路径,可以重复使用该参数一次监控多个路径
$ /usr/lib/nagios/plugins/check_disk -w 10% -c 5% -p / -p /opt -p /boot DISK OK - free space: / 23872 MB (66% inode=92%); /opt 99242 MB (47% inode=93%); /boot 276 MB (63% inode=99%);| /=11767MB;33792;35669;0;37547 /opt=110882MB;199232;210300;0;221369 /boot=160MB;414;437;0;460 $ /usr/lib/nagios/plugins/check_disk -w 100M -c 50M -p / -p /opt -p /boot DISK OK - free space: / 23872 MB (66% inode=92%); /opt 99242 MB (47% inode=93%); /boot 276 MB (63% inode=99%);| /=11768MB;37447;37497;0;37547 /opt=110882MB;221269;221319;0;221369 /boot=160MB;360;410;0;460
-x, --exclude_device=PATH 排除监控路径
/usr/lib64/nagios/plugins/check_disk -w 10% -c 5% -e -x /bak -x /u01
20.6.6.3. disk-smb.cfg
$ cat disk-smb.cfg # 'check_disk_smb' command definition define command{ command_name check_disk_smb command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' } # 'check_disk_smb_workgroup' command definition define command{ command_name check_disk_smb_workgroup command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' } # 'check_disk_smb_host' command definition define command{ command_name check_disk_smb_host command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' } # 'check_disk_smb_workgroup_host' command definition define command{ command_name check_disk_smb_workgroup_host command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' } # 'check_disk_smb_user' command definition define command{ command_name check_disk_smb_user command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -u '$ARG3$' -p '$ARG4$' -w '$ARG5$' -c '$ARG6$' } # 'check_disk_smb_workgroup_user' command definition define command{ command_name check_disk_smb_workgroup_user command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' -u '$ARG4$' -p '$ARG5$' } # 'check_disk_smb_host_user' command definition define command{ command_name check_disk_smb_host_user command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -u '$ARG3$' -p '$ARG4$' } # 'check_disk_smb_workgroup_host_user' command definition define command{ command_name check_disk_smb_workgroup_host_user command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' -u '$ARG4$' -p '$ARG5$' }
20.6.7. check_tcp
20.6.7.1. 端口检查
$ /usr/lib/nagios/plugins/check_tcp -H 172.16.1.2 -p 80 TCP OK - 0.000 second response time on port 80|time=0.000369s;;;0.000000;10.000000
20.6.7.2. Memcache
$ /usr/lib64/nagios/plugins/check_tcp -H localhost -p 11211 -t 5 -E -s 'stats\r\nquit\r\n' -e 'uptime' -M crit TCP OK - 0.001 second response time on port 11211 [STAT pid 29253 STAT uptime 36088 STAT time 1311100189 STAT version 1.4.5 STAT pointer_size 64 STAT rusage_user 3.207512 STAT rusage_system 50.596308 STAT curr_connections 10 STAT total_connections 97372 STAT connection_structures 84 STAT cmd_get 84673 STAT cmd_set 273 STAT cmd_flush 0 STAT get_hits 84336 STAT get_misses 337 STAT delete_misses 0 STAT delete_hits 0 STAT incr_misses 0 STAT incr_hits 0 STAT decr_misses 0 STAT decr_hits 0 STAT cas_misses 0 STAT cas_hits 0 STAT cas_badval 0 STAT auth_cmds 0 STAT auth_errors 0 STAT bytes_read 49280152 STAT bytes_written 46326517326 STAT limit_maxbytes 4294967296 STAT accepting_conns 1 STAT listen_disabled_num 0 STAT threads 4 STAT conn_yields 0 STAT bytes 1345 STAT curr_items 14 STAT total_items 241 STAT evictions 0 STAT reclaimed 135 END]|time=0.000658s;;;0.000000;5.000000
20.6.7.3. Redis
# /usr/lib64/nagios/plugins/check_tcp -H 192.168.2.1 -p 6379 -t 5 -E -s 'info\r\n' -q 'quit\r\n' -e 'uptime_in_days' -M crit TCP OK - 0.001 second response time on port 6379 [$1043 redis_version:2.4.10 redis_git_sha1:00000000 redis_git_dirty:0 arch_bits:64 multiplexing_api:epoll gcc_version:4.4.6 process_id:21331 uptime_in_seconds:18152153 uptime_in_days:210 lru_clock:1801614 used_cpu_sys:1579.41 used_cpu_user:2279.26 used_cpu_sys_children:54.32 used_cpu_user_children:54.11 connected_clients:2 connected_slaves:1 client_longest_output_list:0 client_biggest_input_buf:0 blocked_clients:0 used_memory:1158016 used_memory_human:1.10M used_memory_rss:1560576 used_memory_peak:1289920 used_memory_peak_human:1.23M mem_fragmentation_ratio:1.35 mem_allocator:jemalloc-2.2.5 loading:0 aof_enabled:0 changes_since_last_save:2 bgsave_in_progress:0 last_save_time:1423107828 bgrewriteaof_in_progress:0 total_connections_received:594376 total_commands_processed:1350747 expired_keys:12199 evicted_keys:0 keyspace_hits:511525 keyspace_misses:124116 pubsub_channels:0 pubsub_patterns:0 latest_fork_usec:361 vm_enabled:0 role:master slave0:192.168.6.1,58091,online db0:keys=1913,expires=7]|time=0.000815s;;;0.000000;5.000000
20.6.8. check_log
官方的 check_log 有很多缺陷,不能监控大文件。它的监控原理是先把日志 cat 到 oldlog 保存,然后通过 diff 比较两者的差异
20.6.9. check_traffic
http://exchange.nagios.org/directory/Plugins/Network-Connections,-Stats-and-Bandwidth/check_traffic-2Esh/details
https://github.com/cloved/check_traffic
网卡流量监测
20.6.10. Nagios nrpe plugins
nrpe 插件接收来自 nagios-nrpe-server 的数据报告
cat /etc/nagios3/hosts/host.example.org.cfg define host{ use generic-host ; Inherit default values from a template host_name host.example.org ; The name we're giving to this host alias Some Remote Host ; A longer name associated with the host address 172.16.1.3 ; IP address of the host hostgroups all ; Host groups this host is associated with } # NRPE disk check. define service { use generic-service host_name backup service_description nrpe-disk check_command check_nrpe_1arg!check_all_disks!172.16.1.3 } define service { use generic-service host_name backup service_description nrpe-users check_command check_nrpe_1arg!check_users!172.16.1.3 } define service { use generic-service host_name backup service_description nrpe-swap check_command check_nrpe_1arg!check_swap!172.16.1.3 } define service { use generic-service host_name backup service_description nrpe-procs check_command check_nrpe_1arg!check_procs!172.16.1.3 }
20.6.11. check_nt
Define windows services that should be monitored.
# Define a host for the Windows machine we'll be monitoring # Change the host_name, alias, and address to fit your situation define host{ use windows-server ; Inherit default values from a template host_name remote-windows-host ; The name we're giving to this host alias Remote Windows Host ; A longer name associated with the host address 192.168.1.4 ; IP address of the remote windows host } define service{ use generic-service host_name remote-windows-host service_description NSClient++ Version check_command check_nt!CLIENTVERSION } define service{ use generic-service host_name remote-windows-host service_description Uptime check_command check_nt!UPTIME } define service{ use generic-service host_name remote-windows-host service_description CPU Load check_command check_nt!CPULOAD!-l 5,80,90 } define service{ use generic-service host_name remote-windows-host service_description Memory Usage check_command check_nt!MEMUSE!-w 80 -c 90 } define service{ use generic-service host_name remote-windows-host service_description C:\ Drive Space check_command check_nt!USEDDISKSPACE!-l c -w 80 -c 90 } define service{ use generic-service host_name remote-windows-host service_description W3SVC check_command check_nt!SERVICESTATE!-d SHOWALL -l W3SVC } define service{ use generic-service host_name remote-windows-host service_description Explorer check_command check_nt!PROCSTATE!-d SHOWALL -l Explorer.exe }
Enable Password Protection
define command{ command_name check_nt command_line $USER1$/check_nt -H $HOSTADDRESS$ -p 12489 -s My2Secure$Password -v $ARG1$ $ARG2$ }
20.6.12. nsca - Nagios Service Check Acceptor
# yum install nsca
20.6.13. jmx
nagios plugin to check jmx
https://code.google.com/p/jmxquery/
wget https://jmxquery.googlecode.com/files/jmxquery-1.3-bin.zip unzip jmxquery-1.3-bin.zip chmod +x check_jmx
# ./check_jmx -help Usage: check_jmx [-option...] -U url -O object -A attribute (to query an attribute) or check_jmx [-option...] -U url -O object -M method (to invoke a zero-argument method) or check_jmx -help (to display this help page) Mandatory parameters are: -U JMX URL, for example: "service:jmx:rmi:///jndi/rmi://localhost:1616/jmxrmi" -O Object name to be checked, for example, "java.lang:type=Memory" -A Attribute of the object to be checked, for example, "NonHeapMemoryUsage" (not compatible with -M switch) -M Zero-argument method to be invoked (not compatible with -A switch) Options are: -K <key> Key for compound data, for example, "used" -I <info attribute> Attribute of the object containing information for text output -J <info attribute key> Attribute key for -I attribute compound data, for example, "used" -v[v[v[v]]] Verbatim level controlled as a number of v -w <limit> Warning long value -c <limit> Critical long value -default <value> Use default value if requested object/attribute/method does not exist -username <user name> -password <password> Credentials for JMX Note that if warning level > critical, system checks object attribute value to be LESS THAN OR EQUAL warning, critical If warning level < critical, system checks object attribute value to be MORE THAN OR EQUAL warning, critical
例 20.1.
# ./check_jmx -U service:jmx:rmi:///jndi/rmi://localhost:9012/jmxrmi -O java.lang:type=Memory -A HeapMemoryUsage -K used -I HeapMemoryUsage -J used -vvvv -w 731847066 -c 1045495808 JMX OK - HeapMemoryUsage.used=98617544 | HeapMemoryUsage.used=98617544,committed=514850816;init=536870912;max=7635730432;used=98617544
# ./check_jmx -U service:jmx:rmi:///jndi/rmi://localhost:9012/jmxrmi -O org:type=Spring,name=BackgroundService -A QueueSize -w 10 -c 20 JMX CRITICAL - org:type=Spring,name=BackgroundService
原文出处:Netkiller 系列 手札
本文作者:陈景峯
转载请与作者联系,同时请务必标明文章原始出处和作者信息及本声明。