下面是我们以前使用过的两个脚本,用于对 HBase 和 HDFS 进行健康检查,并在 HDFS 剩余容量不足时告警,简单易用。
1.针对hadoop2的脚本:
#!/bin/bash
#
# Nagios-style health check for an HBase-on-HDFS cluster (Hadoop 2).
#
# Steps:
#   1. Run `hbase hbck` and `hadoop fsck /apps/hbase` and append both
#      reports to the log file named by $output.
#   2. Report CRITICAL when the reports of this run contain any keyword
#      listed in $ABNORMAL_QUERY.
#   3. Read PercentRemaining from the NameNode JMX endpoint and report
#      WARNING / CRITICAL when it falls below the thresholds below.
#
# Exit status: the STATE_* value matching the result (0 OK, 1 WARNING,
# 2 CRITICAL, 3 UNKNOWN).
#
# Strict mode (`set -e`) is deliberately not enabled: `grep -c` exits
# non-zero when it counts zero matches, which is the healthy case here.

# Absolute directory containing this script (not referenced further below).
bin=$(cd -- "$(dirname -- "$0")" && pwd)

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Load the system-wide environment before invoking hbase/hadoop.
source /etc/profile

# Thresholds, in percent of DFS capacity still free.
DFS_REMAINING_WARNING=15
DFS_REMAINING_CRITICAL=5

# Extended regex of keywords that mark an hbck/fsck report as abnormal.
ABNORMAL_QUERY="INCONSISTENT|CORRUPT|FAILED|Exception"

HADOOP_WEB_INTERFACE=h001.hadoop
# Defined for completeness; nothing below reads it.
HBASE_WEB_INTERFACE=h008.hadoop

# hbck and fsck report
output=/var/log/cluster-status

# Collect this run's reports separately so that keyword matches left in the
# log by earlier runs do not keep the check CRITICAL forever.
report=$(mktemp) || {
  echo "Cannot create a temporary report file."
  exit "$STATE_UNKNOWN"
}
trap 'rm -f -- "$report"' EXIT

hbase hbck >> "$report"
hadoop fsck /apps/hbase >> "$report"
cat -- "$report" >> "$output"

# check report
count=$(grep -E -c -- "$ABNORMAL_QUERY" "$report")
if [ "$count" -eq 0 ]; then
  echo "[OK] Cluster is healthy." >> "$output"
else
  echo "[ABNORMAL] Cluster is abnormal!" >> "$output"
  # Get the last matching entry in the report file
  last_entry=$(grep -E -- "$ABNORMAL_QUERY" "$report" | tail -n 1)
  echo "($count) $last_entry"
  exit "$STATE_CRITICAL"
fi

# HDFS usage
# The URL is quoted because it contains `?`, which is a glob character.
# head keeps only the first number found after "PercentRemaining".
dfs_remaining=$(curl -s "http://${HADOOP_WEB_INTERFACE}:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo" \
  | grep -E -o "PercentRemaining.*" \
  | grep -E -o "[0-9]*\.[0-9]*" \
  | head -n 1)
dfs_remaining_word="DFS Remaining ${dfs_remaining}%"
echo "$dfs_remaining_word" >> "$output"

# check HDFS usage
# Compare on the integer part only; the shell has no floating-point tests.
dfs_remaining_pct=${dfs_remaining%%.*}
if ! [[ "$dfs_remaining_pct" =~ ^[0-9]+$ ]]; then
  # curl failed or the JMX output did not contain a usable number.
  echo "Cannot determine the DFS remaining percentage."
  exit "$STATE_UNKNOWN"
fi

if [ "$dfs_remaining_pct" -lt "$DFS_REMAINING_CRITICAL" ]; then
  echo "Low DFS space. $dfs_remaining_word"
  exit_status=$STATE_CRITICAL
elif [ "$dfs_remaining_pct" -lt "$DFS_REMAINING_WARNING" ]; then
  echo "Low DFS space. $dfs_remaining_word"
  exit_status=$STATE_WARNING
else
  echo "HBase check OK - DFS and HBase healthy. $dfs_remaining_word"
  exit_status=$STATE_OK
fi
exit "$exit_status"
2.针对hadoop1的脚本:
#!/bin/bash
#
# Nagios-style health check for an HBase-on-HDFS cluster (Hadoop 1).
#
# Steps:
#   1. Run `hbase hbck` and `hadoop fsck /hbase` and append both reports to
#      the log file named by $output.
#   2. Report CRITICAL when the reports of this run contain any keyword
#      listed in $ABNORMAL_QUERY.
#   3. Scrape the master-status page for dead RegionServers and report
#      CRITICAL when any are listed.
#   4. Scrape dfshealth.jsp for the "DFS Remaining%" figure and report
#      WARNING / CRITICAL when it falls below the thresholds below.
#
# Exit status: the STATE_* value matching the result (0 OK, 1 WARNING,
# 2 CRITICAL, 3 UNKNOWN).
#
# Strict mode (`set -e`) is deliberately not enabled: `grep -c` exits
# non-zero when it counts zero matches, which is the healthy case here.

# Absolute directory containing this script (not referenced further below).
bin=$(cd -- "$(dirname -- "$0")" && pwd)

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Load the system-wide environment before invoking hbase/hadoop.
source /etc/profile

# Thresholds, in percent of DFS capacity still free.
DFS_REMAINING_WARNING=15
DFS_REMAINING_CRITICAL=5

# Extended regex of keywords that mark an hbck/fsck report as abnormal.
ABNORMAL_QUERY="INCONSISTENT|CORRUPT|FAILED|Exception"

# Externally reachable IP (or hostname) of the Hadoop NameNode.
# Fill it in here or export it in the environment before running.
HADOOP_WEB_INTERFACE="${HADOOP_WEB_INTERFACE:-}"
if [ -z "$HADOOP_WEB_INTERFACE" ]; then
  echo "HADOOP_WEB_INTERFACE is not set."
  exit "$STATE_UNKNOWN"
fi

# hbase and hadoop are launched from their install directories.
if [ -z "${HBASE_HOME:-}" ] || [ -z "${HADOOP_HOME:-}" ]; then
  echo "HBASE_HOME and HADOOP_HOME must both be set."
  exit "$STATE_UNKNOWN"
fi

# hbck and fsck report
output=/data/logs/cluster-status

# Collect this run's reports separately so that keyword matches left in the
# log by earlier runs do not keep the check CRITICAL forever.
report=$(mktemp) || {
  echo "Cannot create a temporary report file."
  exit "$STATE_UNKNOWN"
}
trap 'rm -f -- "$report"' EXIT

"${HBASE_HOME}/bin/hbase" hbck >> "$report"
"${HADOOP_HOME}/bin/hadoop" fsck /hbase >> "$report"
cat -- "$report" >> "$output"

# check report
count=$(grep -E -c -- "$ABNORMAL_QUERY" "$report")
if [ "$count" -eq 0 ]; then
  echo "[OK] Cluster is healthy." >> "$output"
else
  echo "[ABNORMAL] Cluster is abnormal!" >> "$output"
  # Get the last matching entry in the report file
  last_entry=$(grep -E -- "$ABNORMAL_QUERY" "$report" | tail -n 1)
  echo "($count) $last_entry"
  exit "$STATE_CRITICAL"
fi

# Check RegionServer Status
# NOTE(review): port 60010 is queried on HADOOP_WEB_INTERFACE, exactly as in
# the original script; confirm the HBase master runs on that host.
master_status=$(curl -s "http://${HADOOP_WEB_INTERFACE}:60010/master-status") || {
  # An unreachable page must not be mistaken for "no dead servers".
  echo "Cannot fetch the HBase master status page."
  exit "$STATE_UNKNOWN"
}

# Keep the section between the "Dead Region Servers" and "Regions in
# Transition" headings, then pull the link text out of each anchor and join
# the names onto a single line.
dead_region_servers=$(printf '%s\n' "$master_status" \
  | grep "Dead Region Servers" -A 500 \
  | grep "Regions in Transition" -B 500 \
  | grep -E -o 'target="_blank">.*</a>' \
  | awk -F">" '{print $2}' \
  | awk -F"<" '{print $1}' \
  | paste -s -d ' ' -)

if [ -z "$dead_region_servers" ]; then
  echo "[OK] All RegionServers is healthy."
  echo "[OK] All RegionServers is healthy." >> "$output"
else
  echo "[ABNORMAL] the dead regionserver list:" >> "$output"
  echo "$dead_region_servers" >> "$output"
  # Also print to stdout so the monitoring system shows why it is CRITICAL.
  echo "[ABNORMAL] the dead regionserver list: $dead_region_servers"
  exit "$STATE_CRITICAL"
fi

# HDFS usage
# NOTE(review): the first grep pattern was truncated in the source text and
# has been reconstructed; confirm it against the dfshealth.jsp markup.
# head keeps only the first number found in the matched span.
dfs_remaining=$(curl -s "http://${HADOOP_WEB_INTERFACE}:50070/dfshealth.jsp" \
  | grep -E -o "DFS Remaining%.*Live Nodes" \
  | grep -E -o "[0-9]*\.[0-9]*" \
  | head -n 1)
dfs_remaining_word="DFS Remaining ${dfs_remaining}%"
echo "$dfs_remaining_word" >> "$output"

# check HDFS usage
# Compare on the integer part only; the shell has no floating-point tests.
dfs_remaining_pct=${dfs_remaining%%.*}
if ! [[ "$dfs_remaining_pct" =~ ^[0-9]+$ ]]; then
  # curl failed or the page did not contain a usable number.
  echo "Cannot determine the DFS remaining percentage."
  exit "$STATE_UNKNOWN"
fi

if [ "$dfs_remaining_pct" -lt "$DFS_REMAINING_CRITICAL" ]; then
  echo "Low DFS space. $dfs_remaining_word"
  exit_status=$STATE_CRITICAL
elif [ "$dfs_remaining_pct" -lt "$DFS_REMAINING_WARNING" ]; then
  echo "Low DFS space. $dfs_remaining_word"
  exit_status=$STATE_WARNING
else
  echo "HBase check OK - DFS and HBase healthy. $dfs_remaining_word"
  exit_status=$STATE_OK
fi
exit "$exit_status"