最近负责的几个省的项目,问题频出,经常接到运维同事这样的反馈
- jvm cpu%
- ogg 断了
- oracle 归档日志满了
以上原因可能导致数据不能正常接收并传输,因此还要想办法补救,运维同学焦头烂额,因此半夜开发接到电话是很正常的。那么问题来了有没有解决办法,能及时提醒?答案是肯定的!
系统信息监控
运维离不开对系统信息的监控,如CPU的使用率、内存的占用情况、网络、进程等相关信息都需要被监控,虽然我们可以通过操作系统提供的任务管理器或命令查看相关信息,但仍不能简化这些日常的运维任务。如果我们通过编写程序获取以上信息,那么系统信息监控就是一件轻松而简单的工作。
在Python中获取系统信息最便捷的模块是psutil(process and systemutilities)。通过简短的几行代码就可以获取系统相关信息,而且还是跨平台库。
psutil
psutil(进程和系统实用程序)是一个跨平台库,用于检索 Python 中有关正在运行的进程和系统利用率(CPU、内存、磁盘、网络、传感器)的信息。它主要用于系统监控、分析和限制进程资源以及运行过程的管理。它实现了经典UNIX命令行工具提供的许多功能_,如ps、顶部、iotop、lsof、netstat、ifconfig、free_和免费等。
git地址https://github.com/giampaolo/psutil
psutil不属于标准库,需要手动安装。安装psutil非常简单,执行以下命令即可。
pip install psutil
CPU监控
>>> import psutil >>> >>> psutil.cpu_times() scputimes(user=3961.46, nice=169.729, system=2150.659, idle=16900.540, iowait=629.59, irq=0.0, softirq=19.42, steal=0.0, guest=0, nice=0.0) >>> >>> for x in range(3): ... psutil.cpu_percent(interval=1) ... 4.0 5.9 3.8 >>> >>> for x in range(3): ... psutil.cpu_percent(interval=1, percpu=True) ... [4.0, 6.9, 3.7, 9.2] [7.0, 8.5, 2.4, 2.1] [1.2, 9.0, 9.9, 7.2] >>> >>> for x in range(3): ... psutil.cpu_times_percent(interval=1, percpu=False) ... scputimes(user=1.5, nice=0.0, system=0.5, idle=96.5, iowait=1.5, irq=0.0, softirq=0.0, steal=0.0, guest=0.0, guest_nice=0.0) scputimes(user=1.0, nice=0.0, system=0.0, idle=99.0, iowait=0.0, irq=0.0, softirq=0.0, steal=0.0, guest=0.0, guest_nice=0.0) scputimes(user=2.0, nice=0.0, system=0.0, idle=98.0, iowait=0.0, irq=0.0, softirq=0.0, steal=0.0, guest=0.0, guest_nice=0.0) >>> >>> psutil.cpu_count() 4 >>> psutil.cpu_count(logical=False) 2 >>> >>> psutil.cpu_stats() scpustats(ctx_switches=20455687, interrupts=6598984, soft_interrupts=2134212, syscalls=0) >>> >>> psutil.cpu_freq() scpufreq(current=931.42925, min=800.0, max=3500.0) >>> >>> psutil.getloadavg() # also on Windows (emulated) (3.14, 3.89, 4.67)
内存监控
>>> psutil.virtual_memory() svmem(total=10367352832, available=6472179712, percent=37.6, used=8186245120, free=2181107712, active=4748992512, inactive=2758115328, buffers=790724608, cached=3500347392, shared=787554304) >>> psutil.swap_memory() sswap(total=2097147904, used=296128512, free=1801019392, percent=14.1, sin=304193536, sout=677842944) >>>
硬盘监控
>>> psutil.disk_partitions() [sdiskpart(device='/dev/sda1', mountpoint='/', fstype='ext4', opts='rw,nosuid', maxfile=255, maxpath=4096), sdiskpart(device='/dev/sda2', mountpoint='/home', fstype='ext, opts='rw', maxfile=255, maxpath=4096)] >>> >>> psutil.disk_usage('/') sdiskusage(total=21378641920, used=4809781248, free=15482871808, percent=22.5) >>> >>> psutil.disk_io_counters(perdisk=False) sdiskio(read_count=719566, write_count=1082197, read_bytes=18626220032, write_bytes=24081764352, read_time=5023392, write_time=63199568, read_merged_count=619166, write_merged_count=812396, busy_time=4523412) >>>
进程监控
>>> import psutil >>> psutil.pids() [1, 2, 3, 4, 5, 6, 7, 46, 48, 50, 51, 178, 182, 222, 223, 224, 268, 1215, 1216, 1220, 1221, 1243, 1244, 1301, 1601, 2237, 2355, 2637, 2774, 3932, 4176, 4177, 4185, 4187, 4189, 4225, 4243, 4245, 4263, 4282, 4306, 4311, 4312, 4313, 4314, 4337, 4339, 4357, 4358, 4363, 4383, 4395, 4408, 4433, 4443, 4445, 4446, 5167, 5234, 5235, 5252, 5318, 5424, 5644, 6987, 7054, 7055, 7071] >>> >>> p = psutil.Process(7055) >>> p psutil.Process(pid=7055, name='python3', status='running', started='09:04:44') >>> p.name() 'python' >>> p.exe() '/usr/bin/python' >>> p.cwd() '/home/giampaolo' >>> p.cmdline() ['/usr/bin/python', 'main.py'] >>> >>> p.pid 7055 >>> p.ppid() 7054 >>> p.children(recursive=True) [psutil.Process(pid=29835, name='python3', status='sleeping', started='11:45:38'), psutil.Process(pid=29836, name='python3', status='waking', started='11:43:39')] >>> >>> p.parent() psutil.Process(pid=4699, name='bash', status='sleeping', started='09:06:44') >>> p.parents() [psutil.Process(pid=4699, name='bash', started='09:06:44'), psutil.Process(pid=4689, name='gnome-terminal-server', status='sleeping', started='0:06:44'), psutil.Process(pid=1, name='systemd', status='sleeping', started='05:56:55')] >>> >>> p.status() 'running' >>> p.username() 'giampaolo' >>> p.create_time() 1267551141.5019531 >>> p.terminal() '/dev/pts/0' >>> >>> p.uids() puids(real=1000, effective=1000, saved=1000) >>> p.gids() pgids(real=1000, effective=1000, saved=1000) >>> >>> p.cpu_times() pcputimes(user=1.02, system=0.31, children_user=0.32, children_system=0.1, iowait=0.0) >>> p.cpu_percent(interval=1.0) 12.1 >>> p.cpu_affinity() [0, 1, 2, 3] >>> p.cpu_affinity([0, 1]) # set >>> p.cpu_num() 1 >>> >>> p.memory_info() pmem(rss=10915840, vms=67608576, shared=3313664, text=2310144, lib=0, data=7262208, dirty=0) >>> p.memory_full_info() # "real" USS memory usage (Linux, macOS, Win only) pfullmem(rss=10199040, vms=52133888, shared=3887104, text=2867200, lib=0, data=5967872, dirty=0, uss=6545408, pss=6872064, swap=0) >>> p.memory_percent() 0.7823 >>> p.memory_maps() [pmmap_grouped(path='/lib/x8664-linux-gnu/libutil-2.15.so', rss=32768, size=2125824, pss=32768, shared_clean=0, shared_dirty=0, private_clean=20480, private_dirty=12288, referenced=32768, anonymous=12288, swap=0), pmmap_grouped(path='/lib/x8664-linux-gnu/libc-2.15.so', rss=3821568, size=3842048, pss=3821568, shared_clean=0, shared_dirty=0, private_clean=0, private_dirty=3821568, referenced=3575808, anonymous=3821568, swap=0), pmmap_grouped(path='[heap]', rss=32768, size=139264, pss=32768, shared_clean=0, shared_dirty=0, private_clean=0, private_dirty=32768, referenced=32768, anonymous=32768, swap=0), pmmap_grouped(path='[stack]', rss=2465792, size=2494464, pss=2465792, shared_clean=0, shared_dirty=0, private_clean=0, private_dirty=2465792, referenced=2277376, anonymous=2465792, swap=0), ...] >>> >>> p.io_counters() pio(read_count=478001, write_count=59371, read_bytes=700416, write_bytes=69632, read_chars=456232, write_chars=517543) >>> >>> p.open_files() [popenfile(path='/home/giampaolo/monit.py', fd=3, position=0, mode='r', flags=32768), popenfile(path='/var/log/monit.log', fd=4, position=235542, mode='a', flags=33793)] >>> >>> p.connections(kind='tcp') [pconn(fd=115, family=<AddressFamily.AF_INET: 2>, type=<SocketType.SOCK_STREAM: 1>, laddr=addr(ip='10.0.0.1', port=48776), raddr=addr(ip='93.186.135.91', port=80), status='ESTABLISHED'), pconn(fd=117, family=<AddressFamily.AF_INET: 2>, type=<SocketType.SOCK_STREAM: 1>, laddr=addr(ip='10.0.0.1', port=43761), raddr=addr(ip='72.14.234.100', port=80), status='CLOSING')] >>> >>> p.num_threads() 4 >>> p.num_fds() 8 >>> p.threads() [pthread(id=5234, user_time=22.5, system_time=9.2891), pthread(id=5237, user_time=0.0707, system_time=1.1)] >>> >>> p.num_ctx_switches() pctxsw(voluntary=78, involuntary=19) >>> >>> p.nice() 0 >>> p.nice(10) # set >>> >>> p.ionice(psutil.IOPRIO_CLASS_IDLE) # IO priority (Win and Linux only) >>> p.ionice() pionice(ioclass=<IOPriority.IOPRIO_CLASS_IDLE: 3>, value=0) >>> >>> p.rlimit(psutil.RLIMIT_NOFILE, (5, 5)) # set resource limits (Linux only) >>> p.rlimit(psutil.RLIMIT_NOFILE) (5, 5) >>> >>> p.environ() {'LC_PAPER': 'it_IT.UTF-8', 'SHELL': '/bin/bash', 'GREP_OPTIONS': '--color=auto', 'XDG_CONFIG_DIRS': '/etc/xdg/xdg-ubuntu:/usr/share/upstart/xdg:/etc/xdg', ...} >>> >>> p.as_dict() {'status': 'running', 'num_ctx_switches': pctxsw(voluntary=63, involuntary=1), 'pid': 5457, ...} >>> p.is_running() True >>> p.suspend() >>> p.resume() >>> >>> p.terminate() >>> p.kill() >>> p.wait(timeout=3) <Exitcode.EX_OK: 0> >>> >>> psutil.test() USER PID %CPU %MEM VSZ RSS TTY START TIME COMMAND root 1 0.0 0.0 24584 2240 Jun17 00:00 init root 2 0.0 0.0 0 0 Jun17 00:00 kthreadd ... giampaolo 31475 0.0 0.0 20760 3024 /dev/pts/0 Jun19 00:00 python2.4 giampaolo 31721 0.0 2.2 773060 181896 00:04 10:30 chrome root 31763 0.0 0.0 0 0 00:05 00:00 kworker/0:1 >>>
扩展想象
- 利用该工具检测ogg进程
- 利用该工具检测cpu 100% dump thread,保留线索。
- 整合邮件api和短信api进行发送检测结果
更多精彩推荐
Glances - An eye on your system
一个跨平台的监视工具,web可视化监控平台。
https://github.com/nicolargo/glances