最近发生了一次因 HDFS 坏块导致的 Hive 计算问题,因此写了一个监控脚本,用来监控 HDFS 的坏块。脚本如下:

#!/usr/bin/python
# -*- coding: utf8 -*-
#edit by ericni
#20140724
#monitor hdfs corrupt
import  sys
import  property
import  sendmail
import  re,os
reload (sys)
sys.setdefaultencoding( 'utf-8' )
# Shell command that lists the corrupt file blocks known to the NameNode.
FSCK_CMD = "hadoop fsck -list-corruptfileblocks"

# Static head of the HTML mail. The table width is written as a literal
# "96%" (no string formatting involved) so the CSS stays valid.
MAIL_HEAD = """
                        <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
                        <title>Hadoop集群坏块监控</title>
                          <style type="text/css">
                           body { font-size: 14px; color: #333;background-color: #fff;}
                           td { border: 1px solid #C1DAD7;padding: 4px 10px; line-height: 24px;}
                           table {border-collapse: collapse; width: 96%;}
                           .divtd {color:#E28E40;}
                           .divcss5{ color:#F00;}
                          </style> """

# Summary paragraph and table header; takes the number of corrupt blocks.
# "%%" renders as a literal percent sign after formatting.
MAIL_TABLE_OPEN = """
        <p style="margin: 0; padding: 5px 0; line-height: 28px; font: bold 18px/1.5;">坏块数量 %d,具体信息如下:</p>
        <div>
                                <table style="border-collapse: collapse; width: 96%%;">
                                        <tbody>
                                                <tr>
                                                        <td><div class="divtd">序号</div></td>
                                                        <td><div class="divtd">块号</div></td>
                                                        <td><div class="divtd">文件信息</div></td>

                                                </tr>
                """

# One table row: sequence number, block id, file path.
MAIL_ROW = """
                                 <tr>
                        <td><div>%d </div></td><td><div>%s </div></td><td><div>%s </div></td>
                        </tr>
                        """

MAIL_TABLE_CLOSE = "</tbody></table></div>"


def list_corrupt_output(cmd):
    """Run *cmd* through the shell and return its stdout as a list of lines."""
    pipe = os.popen(cmd)
    try:
        return pipe.readlines()
    finally:
        # Always release the pipe so the child process is reaped.
        pipe.close()


def parse_corrupt_blocks(lines):
    """Extract ``(block_id, file_path)`` pairs from fsck output lines.

    A line is kept when it contains ``blk_`` and does not contain
    ``.Trash``. The first whitespace-separated token is the block id and
    the rest of the line is the file path, so paths containing spaces are
    kept whole. A line with no second field yields an empty path instead
    of raising ``IndexError``.
    """
    blocks = []
    for line in lines:
        if "blk_" not in line or ".Trash" in line:
            continue
        fields = line.split(None, 1)
        file_path = fields[1].strip() if len(fields) > 1 else ""
        blocks.append((fields[0], file_path))
    return blocks


def build_mail_content(blocks):
    """Return the HTML mail body for a list of ``(block_id, file_path)``."""
    parts = [MAIL_HEAD, MAIL_TABLE_OPEN % len(blocks)]
    for index, (block_id, file_path) in enumerate(blocks, 1):
        parts.append(MAIL_ROW % (index, block_id, file_path))
    parts.append(MAIL_TABLE_CLOSE)
    return "".join(parts)


def main():
    """Check HDFS for corrupt blocks and mail a report if any are found."""
    output = list_corrupt_output(FSCK_CMD)
    print(output)
    blocks = parse_corrupt_blocks(output)
    if not blocks:
        return
    for block_id, file_path in blocks:
        print("blkid is " + block_id + " file is " + file_path)
    mailcontent = build_mail_content(blocks)
    print(mailcontent)
    mailto = ['ericni.ni@xxxxx']
    subject = "Hadoop集群坏块监控"
    # Under Python 2 this encode on a byte string relies on the file-level
    # sys.setdefaultencoding('utf-8') call.
    sendmail.send_mail_withoutSSL(subject, mailcontent.encode('utf-8'), mailto)


if __name__ == "__main__":
    main()