1. 作者
muyuan.y@alibaba-inc.com
yufeng.s@alibaba-inc.com
https://github.com/mYu4N/bpftrace/blob/main/mypod-lifecycle.py
2. 核心痛点
- 成本原因未开启“Pod事件监控”
- 开启,但因种种意外导致数据未上报至SLS
- 排查问题时,需要人肉分析日志及对应时间点和先后关系,从而带来大量额外的时间开销和pod事件错漏问题
3. ACK官网效果
4. 辅助工具最终效果
4.1. 使用方法
4.2. 功能细节简述
- 分析离线日志,结合专业的k8s专家问题排查经验,根据k8s pod的离线日志抓取、检测特定的pod event
- 根据配置文件的event等级、args参数的pod名称对数据进行过滤用以对不同pod做不同维度和等级的分析
- pod event数据默认按照秒聚合展示在时间轴两侧,并根据event等级显示不同的颜色以展示重要性
- pod生命周期会以svg的格式保存至本地用于离线分析或技术交流
- svg本地图片将会默认注释并保存整个生命周期内,某一秒因为事件太多而缩略展示的pod event完整始末。
- matplotlib绘图控件click hook支持点击显示某一时刻所有pod event的细节信息
- matplotlib绘图控件slider hook支持拖拽时间轴以观察某一段时间的pod event,避免因为event太多,事件不够导致的图像紧凑
- matplotlib绘图控件支持拖拽时间轴到最左侧以展示整个pod 生命周期总览
4.3. svg本地图片效果
4.4. matplotlib交互控件效果
5. 源码
# author muyuan.y yufeng.s
# python3 mypod-lifecycle.py --podname jpprod-oversea-user-message-server-0 --logfile messages --eventlevel Info,Normal,Warning,Error,Fatal
"""Draw the lifecycle of one Kubernetes pod from an offline log file.

The script scans a log file for lines that mention the pod name and one of
the event keywords in ``POD_EVENT_CONFIG``, aggregates the matches per second,
saves the resulting timeline as an SVG file and then opens an interactive
matplotlib window (click to show the events of one second, slider to scroll).
"""
import argparse
import datetime
import logging
import math
import re
from collections import Counter
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.backend_bases import PickEvent
from matplotlib.widgets import Slider

# Fonts able to render the CJK labels used in the figure.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Songti SC', 'STFangsong']
plt.rcParams['axes.unicode_minus'] = False

parser = argparse.ArgumentParser(description="display pod lifecycle")
parser.add_argument("--podname", default=None, type=str, help="name of pod, default=None")
parser.add_argument("--logfile", default=None, type=str, help="log file path of pod, default=None")
parser.add_argument("--eventlevel", default="Normal,Warning,Error,Fatal", type=str,
                    help="event level configure in POD_EVENT_CONFIG::level_desc, default=Normal,Warning,Error,Fatal")
parser.add_argument("--loggerlevel", default="INFO", type=str,
                    help="script runtime logger level, default=INFO")
args = parser.parse_args()

LOG_LEVEL_MAP = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "ERROR": logging.ERROR,
    "WARNING": logging.WARNING,
}


class MyLogger:
    """Thin wrapper around a console ``logging.Logger``."""

    def __init__(self, log_level=LOG_LEVEL_MAP.get(args.loggerlevel.upper(), logging.INFO)):
        # The default falls back to INFO for an unknown --loggerlevel; a bare
        # .get() would hand None to setLevel(), which raises TypeError.
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_level)
        console_handler = logging.StreamHandler()
        console_handler.setLevel(log_level)
        formatter = logging.Formatter('[%(asctime)s] - %(name)s - %(levelname)s - %(message)s')
        console_handler.setFormatter(formatter)
        self.logger.addHandler(console_handler)

    def info(self, message):
        self.logger.info(message)

    def error(self, message):
        self.logger.error(message)

    def debug(self, message):
        self.logger.debug(message)

    def warning(self, msg):
        self.logger.warning(msg)


logger = MyLogger()

# Priority of each level; a label shows the colour and description of the
# highest-priority level among the events it aggregates.
LEVEL_MAP = {
    'Info': 0,
    'Normal': 1,
    'Warning': 2,
    'Error': 3,
    'Fatal': 4,
}

# Display colour for each level.
COLOR_MAP = {
    'Info': 'whitesmoke',  # grey
    'Normal': 'lightgreen',  # light green
    'Warning': 'darkorange',  # orange
    'Error': 'red',
    'Fatal': 'darkred',
}

# Key: keyword searched for in the log. 'alias' is the display name; when it
# is missing the key itself is displayed.
POD_EVENT_CONFIG = {
    'Container started': {
        'level_desc': "Normal"
    },
    'Created container': {
        'bbox_color_show': 'lightgreen',
        'level_desc': "Normal"
    },
    'Started container': {
        'bbox_color_show': 'lightgreen',
        'level_desc': "Normal"
    },
    'SyncLoop ADD': {
        'alias': 'ADD POD',
        'level_desc': "Normal"
    },
    'SyncLoop UPDATE': {
        'alias': 'Update POD',
        'level_desc': "Normal"
    },
    'SyncLoop DELETE': {
        'alias': 'Delete POD',
        'level_desc': "Normal"
    },
    'Probe succeeded': {
        'bbox_color_show': 'whitesmoke',
        'level_desc': "Info"
    },
    'Reason:ContainersNotReady': {
        'alias': 'ContainersNotReady',
        'level_desc': "Warning"
    },
    'Readiness probe failed': {
        'alias': 'Readiness failed',
        'level_desc': "Warning"
    },
    'Liveness probe failed': {
        'alias': 'Liveness failed',
        'level_desc': "Warning"
    },
    'Killing unwanted container': {
        'alias': 'Killing unwanted',
        'level_desc': "Warning"
    },
    'Container exited normally': {
        'alias': 'Container exited',
        'level_desc': "Normal"
    },
    'Killing container': {
        'bbox_color_show': '#FFA500',
        'level_desc': "Warning"
    },
    'will be restarted': {
        'alias': 'Pod restart',
        'level_desc': "Warning"
    },
    'SyncLoop REMOVE': {
        'alias': 'REMOVE POD',
        'level_desc': "Normal"
    },
    'Pod was deleted and then recreated': {
        'alias': 'Pod Recreated',
        'level_desc': "Warning"
    },
    'Pod has been deleted and must be killed': {
        'alias': 'Pod delete & kill',
        'level_desc': "Normal"
    },
    'Pod does not exist on the server': {
        'alias': 'Pod not exist',
        'level_desc': "Normal"
    },
}

# Syslog-style timestamp, e.g. "Mar 15 10:00:00" or "Mar  5 10:00:00".
# Single-digit days are space padded, hence \s+ and \d{1,2}.
_SYSLOG_TIMESTAMP = r'(\w{3}\s+\d{1,2} \d{2}:\d{2}:\d{2})'

# The log carries no year. strptime() would default to 1900, which is not a
# leap year and therefore rejects "Feb 29". The year is never displayed (only
# month/day/time are), so a fixed leap year is used as a placeholder.
_PLACEHOLDER_YEAR = 2000


def config_check_and_process():
    """Fill the derived fields of every ``POD_EVENT_CONFIG`` entry in place.

    For each event this sets ``level`` (from ``LEVEL_MAP``),
    ``bbox_color_show`` (from ``COLOR_MAP``, replacing any configured value)
    and ``alias`` (defaults to the event key). A broken entry is logged and
    skipped rather than aborting the whole check.
    """
    for key, conf in POD_EVENT_CONFIG.items():
        try:
            level_desc = conf['level_desc']
            # Priority of the level.
            conf['level'] = LEVEL_MAP[level_desc]
            # Colour of the level.
            conf['bbox_color_show'] = COLOR_MAP[level_desc]
            # Without an alias the event key is displayed.
            if not conf.get('alias'):
                conf['alias'] = key
        except Exception as e:
            logger.error('config setting error')
            logger.error(e)
    logger.info('config dict check done')


config_check_and_process()


class EventCounter(Counter):
    """Counter whose ``str()`` is one ``key:count`` pair per line."""

    def __str__(self) -> str:
        return '\n'.join(f'{key}:{count}' for key, count in self.items())


def event_agg(events: pd.Series) -> pd.Series:
    """Aggregate all events that fall into the same second.

    Args:
        events: event keywords (keys of ``POD_EVENT_CONFIG``) of one second.

    Returns:
        A Series with the raw event list, a per-alias counter rendered as
        text, the number of events, the colour and level description of the
        highest-priority event, and the aliases joined with ``'->\\n'``.
    """
    event_list = events.to_list()
    event_info = sorted(
        [
            (
                POD_EVENT_CONFIG[event].get('level'),
                POD_EVENT_CONFIG[event].get('bbox_color_show'),
                POD_EVENT_CONFIG[event].get('level_desc'),
                POD_EVENT_CONFIG[event].get('alias'),
            )
            for event in event_list
        ],
        key=lambda item: item[0],
        reverse=True,
    )
    alias_list = [item[3] for item in event_info]
    event_counter = EventCounter(alias_list)
    return pd.Series(
        {
            'event_infos': '\n'.join(event_list),
            'event_counter': str(event_counter),
            'event_size': len(event_list),
            'bbox_color_show': event_info[0][1],  # highest level of the second
            'level_desc': event_info[0][2],  # highest level of the second
            'alias': '->\n'.join(alias_list),
        }
    )


def draw_time_text(events: List[str], dates: List[datetime.datetime]):
    """Draw the pod timeline, save it as SVG and open the interactive view.

    Args:
        events: event keywords, parallel to ``dates``.
        dates: timestamp of each event.

    Raises:
        ValueError: if ``events`` is empty.
    """
    if not events:
        raise ValueError('no events to draw')

    event_df = pd.DataFrame({"event": events, "dates": dates})
    # Group by second; the key doubles as the label shown on the time axis.
    second_key = event_df['dates'].apply(lambda x: x.strftime("%m-%d %H:%M:%S"))
    event_summary = event_df.groupby(second_key).apply(lambda x: event_agg(x['event']))

    # Data for the chart. Every second contributes two entries to each list:
    # the event label (at +-1.5 / +-3) and the time label (on the base line).
    _ylabel = []
    _xlim = []
    _levels = []
    _vert = []
    _color = []
    click_content = []
    svg_content = []
    for cnt, (index, row) in enumerate(event_summary.iterrows()):
        flat_alias = row['alias'].replace('\n', ' ')
        if row['event_size'] >= 2:
            info = row['event_counter'] + '\n等级:' + row['level_desc']
            svg_content.append((flat_alias, cnt + 1))
        else:
            info = row['alias'] + '\n等级:' + row['level_desc']
        _ylabel.extend([info, index])
        _xlim.extend([cnt + 1, cnt + 1])
        # Alternate below/above the axis and between two heights so that
        # neighbouring labels do not overlap.
        height = 1.5 if cnt % 4 < 2 else 3
        if cnt % 2 == 0:
            _levels.extend([-height, 0])
            _vert.extend(['top', 'bottom'])
        else:
            _levels.extend([height, 0])
            _vert.extend(['bottom', 'top'])
        _color.extend([row['bbox_color_show'], 'lightgreen'])
        click_content.append(flat_alias)

    x_min, x_max = min(_xlim), max(_xlim)

    fig, ax = plt.subplots(figsize=(100, 10), constrained_layout=True)
    # Title
    ax.set(title=f'Pod-lifecycle {args.podname}')
    # Stems: basefmt is the colour of the base line, linefmt of the stems.
    markerline, stemline, baseline = ax.stem(_xlim, _levels, linefmt="#00BFFF", basefmt="green")
    # Hollow markers drawn above the lines.
    plt.setp(markerline, mec='#00FF00', mfc="w", zorder=3)
    # Move the markers onto the base line by zeroing their y data.
    markerline.set_ydata(np.zeros(len(_xlim)))
    # Text annotations
    for d, level, r, va, color in zip(_xlim, _levels, _ylabel, _vert, _color):
        logger.debug(f'annotate location param: \nd: {d}\nl: {level}\nr: {r}\nva: {va}\ncolor: {color}\n')
        ax.annotate(r, xy=(d, level),
                    xytext=(0, np.sign(level) * 3 - 5 if d % 2 == 0 else 5),
                    textcoords="offset points",
                    va=va, ha="center",
                    bbox=dict(boxstyle='round', facecolor=color, edgecolor='none',
                              pad=0.2 if level == 0 else 0.8))
    ax.set_xlim(x_min - 3, x_max + 3)
    ax.set_ylim(-5, 5)
    # Hide both axes.
    ax.get_yaxis().set_visible(False)
    ax.get_xaxis().set_visible(False)
    # Hide the frame.
    for spine in ["left", "top", "right", "bottom"]:
        ax.spines[spine].set_visible(False)
    # Margin on the y axis only.
    ax.margins(y=0.3)

    # The saved SVG is not interactive, so seconds whose label was shortened
    # to a counter get their full event sequence written as a footnote.
    svg_text_objs = []
    _adj_diff = 0.4
    for idx, (_content, _x) in enumerate(svg_content, start=1):
        logger.debug(f'[{idx}] writing text description on x={_x} desc:{_content}')
        label_level = _levels[_x * 2 - 2]
        _text_y_lim = label_level + _adj_diff if label_level < 0 else label_level - _adj_diff
        _text = ax.text(_x, _text_y_lim, f'[{_ylabel[_x * 2 - 1]}] {_content}', fontsize=12, ha="center")
        svg_text_objs.append(_text)

    plt.tight_layout()
    plt.savefig(f'{args.podname}-Pod-lifecycle.svg')
    logger.info(f'save local image: {args.podname}-Pod-lifecycle.svg')
    if len(_xlim) >= 400:
        logger.warning(
            """
            Too many X-axis elements may cause local image display to be congested.
            You can adjust the `figsize` bigger than (100,10) default or use the `eventlevel` parameter to filter events with low prompt levels
            """)

    # The footnotes are only for the saved image; in the interactive window
    # the same information is available by clicking.
    while svg_text_objs:
        svg_text_objs.pop().remove()

    # Click callback
    clicks = []

    def on_pick(event: PickEvent):
        """Show the full event sequence of the clicked second."""
        mouse = event.mouseevent
        logger.info(mouse)
        if clicks:
            clicks.pop().remove()
        if mouse.button != 1 or mouse.dblclick:
            return
        x = mouse.xdata
        if x is None:
            # The pointer has no data coordinate; nothing to look up.
            return
        x_idx = math.floor(x + 0.5) - 1
        logger.debug(f'content x index: {x_idx}')
        if 0 <= x_idx <= len(click_content) - 1:
            msg = click_content[x_idx]
        else:
            msg = '请点击时间轴内的时间或事件描述以展示具体细节'
        logger.debug(f'content display: {msg}')
        clicks.append(ax.text(x_idx, 4.5, f'{msg}', fontsize=16, ha="center"))
        plt.draw()

    ax.set_picker(True)
    fig.canvas.mpl_connect('pick_event', on_pick)

    # Slider for horizontal scrolling.
    ax_slider = plt.axes([0.1, 0.1, 0.65, 0.03])
    slider = Slider(ax_slider, '时间轴', x_min - 5, x_max, valinit=0, valstep=0.01)
    # Show the first 20 positions by default.
    ax.set_xlim(x_min, x_min + 20)

    def slider_update(val):
        """Scroll the 20-wide window, or show everything at the far left."""
        x_range = slider.val
        if x_range == x_min - 5:
            ax.set_xlim(x_min - 5, x_max + 5)
            logger.debug(f'slider info: xlim({x_range},) label(展示总览,)')
            slider.valtext.set_text('展示总览')
        else:
            ax.set_xlim(x_range, x_range + 20)
            # Clamp to the plotted range before looking up the time labels.
            x_show_left = math.floor(x_min if x_range < x_min else x_range)
            x_show_right = math.floor(x_max if x_range + 20 > x_max else x_range + 20)
            slider_label_show_left = _ylabel[x_show_left * 2 - 1]
            slider_label_show_right = _ylabel[x_show_right * 2 - 1]
            logger.debug(
                f'slider info: xlim({x_show_left},{x_show_right}) label({slider_label_show_left},{slider_label_show_right})')
            slider.valtext.set_text(' ~\n '.join([slider_label_show_left, slider_label_show_right]))
        fig.canvas.draw_idle()

    slider.on_changed(slider_update)
    plt.show()


def _collect_pod_events(logfile, podname, level_filter):
    """Return ``(dates, events)`` for every matching line of ``logfile``.

    A line matches when it contains ``podname`` and, after a syslog
    timestamp, one of the ``POD_EVENT_CONFIG`` keywords whose level is in
    ``level_filter``. Only the first matching keyword (in config order) is
    recorded per line.
    """
    # Compile once per keyword instead of once per line and keyword.
    # re.escape keeps the keyword a literal.
    patterns = [
        (keyword, re.compile(_SYSLOG_TIMESTAMP + '.+' + re.escape(keyword)))
        for keyword, conf in POD_EVENT_CONFIG.items()
        if conf.get('level_desc') in level_filter
    ]
    dates = []
    events = []
    # errors='replace': a stray undecodable byte must not abort the scan;
    # pod names and keywords are plain ASCII.
    with open(logfile, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            if podname not in line:
                continue
            for keyword, pattern in patterns:
                match = pattern.search(line)
                if not match:
                    continue
                try:
                    stamp = datetime.datetime.strptime(
                        f'{_PLACEHOLDER_YEAR} {match.group(1)}', "%Y %b %d %H:%M:%S")
                except ValueError:
                    # Looked like a timestamp but is not one (e.g. unknown
                    # month name); skip it instead of aborting the run.
                    logger.warning(f'skip unparseable timestamp: {match.group(1)}')
                    continue
                dates.append(stamp)
                events.append(keyword)
                break
    return dates, events


if __name__ == '__main__':
    if not args.podname or not args.logfile:
        # Both default to None, which would otherwise surface as a TypeError
        # deep inside the scan.
        parser.error('--podname and --logfile are required')

    # Tolerate spaces around the commas of --eventlevel.
    event_level_filter = {level.strip() for level in args.eventlevel.split(',')}
    dates, events = _collect_pod_events(args.logfile, args.podname, event_level_filter)
    if len(dates) == len(events) != 0:
        draw_time_text(events, dates)
    else:
        logger.error('event list is empty or log file is Incomplete')
        raise ValueError(f'data length: dates={len(dates)} events={len(events)} ,pleas check')