# prometheus 报警规则样例 (Prometheus alerting rules sample)
#
# NOTE(review): the original file declared the top-level `groups:` key once per
# alert group. Duplicate top-level keys are invalid YAML (most parsers silently
# keep only the last one), and a Prometheus rule file expects exactly one
# `groups:` list — all groups are merged under a single key below.
# NOTE(review): Prometheus requires alert names to match
# [a-zA-Z_:][a-zA-Z0-9_:]*; the Chinese / space-containing alert names kept
# below will be rejected by `promtool check rules` — rename before deploying.
groups:
  - name: "带宽检测"
    rules:
      - alert: "带宽告警"
        # irate() yields bytes/s; /1000 converts to KB/s. A threshold of
        # 2000000 KB/s (~2 GB/s) does not match the "20M每秒" wording in the
        # annotations — TODO confirm the intended threshold (20 MB/s ≈ 20000).
        expr: (irate(node_network_transmit_bytes_total{device!~"lo"}[1m]) / 1000) > 2000000
        for: 30s
        annotations:
          summary: "{{ $labels.job}} - {{ $labels.instance }} 平均带宽大于20M每秒"
          description: "Prometheus 报警: 带宽大于20M每秒 \n 主机名: {{ $labels.hostname }}\n ip: {{ $labels.ip }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 可用区: {{$labels.region}} \n 产品线: {{ $labels.product }} \n"

  - name: "cpu检测"
    rules:
      - alert: "cpu负载告警"
        # 100 - (average idle rate * 100) = CPU busy %, per instance.
        expr: (100-(avg(irate(node_cpu_seconds_total{mode="idle",job="node_exporter_alert"}[15m]))by (instance,hostname,region,app,product)) * 100) > 95
        for: 5m
        annotations:
          value: " {{ $value }} "
          summary: "{{ $labels.job}} - {{ $labels.instance }} CPU使用率高于95%"
          # Fixed: description previously said 98% while the expression and
          # summary both use 95%.
          description: "Prometheus 报警: cpu负载使用率超过95%\n 主机名: {{ $labels.hostname }}\n ip: {{ $labels.instance }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 可用区: {{$labels.region}} \n 产品线: {{ $labels.product }} \n"

  - name: "磁盘报警"
    rules:
      - alert: NodeDiskUsage
        # (size - avail) / size * 100 = used %, per filesystem.
        expr: (node_filesystem_size_bytes{job="node_exporter_alert"} - node_filesystem_avail_bytes{job="node_exporter_alert"}) / node_filesystem_size_bytes{job="node_exporter_alert"} * 100 > 85
        for: 1m
        labels:
          severity: high
        annotations:
          value: " {{ $value }} "
          description: "Prometheus 报警: 磁盘报警\n 主机名: {{ $labels.hostname }}\n ip: {{ $labels.ip }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 可用区: {{$labels.region}} \n 产品线: {{ $labels.product }} \n"

  - name: "dnsmasq_check"
    rules:
      - alert: "dnsmasq check"
        # process-exporter metric: fires when no dnsmasq process is running.
        expr: namedprocess_namegroup_num_procs == 0
        for: 10s
        annotations:
          summary: "dnsmasq down"
          description: " dnsmasq , job: "

  - name: dns-resolv-alarm
    rules:
      - alert: dns_resolv_error
        # Custom `dns` gauge pushed via pushgateway: 0 means resolution failed.
        expr: dns{job="pushgateway"} == 0
        for: 1m
        labels:
          team: op
        annotations:
          summary: "[DNS解析报警] [{{$labels.exported_instance}}] 域名解析失败"
          description: "[DNS解析报警] [{{$labels.exported_instance}}] 域名解析报警"

  - name: "http检测规则"
    rules:
      - alert: "http服务检测"
        # blackbox_exporter HTTP probe failure.
        expr: probe_success{job="blackbox-http"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "Prometheus 报警: http检测\n 实例: {{ $labels.instance }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 产品线: {{ $labels.product }} \n"

  - name: "java检测规则"
    rules:
      - alert: "java服务检测"
        # jmx_exporter stops publishing build info when the JVM is down.
        expr: count without (name,version)(irate(jmx_exporter_build_info[1m])) == 0
        for: 1m
        labels:
          env: dev
        annotations:
          description: "java应用 job: {{ $labels.job }} \n instance: {{ $labels.instance }} "
          summary: "java检测"

  - name: "load 报警"
    rules:
      - alert: "load负载告警"
        expr: node_load15{job="node_exporter_alert"} > 50
        for: 1m
        annotations:
          description: "Prometheus 报警: load高于50\n 主机名: {{ $labels.hostname }}\n ip: {{ $labels.ip }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 可用区: {{$labels.region}} \n 产品线: {{ $labels.product }} \n"

  - name: "logstash检测规则"
    rules:
      - alert: "logstash服务检测"
        expr: logstash_info_node == 0
        for: 1m
        annotations:
          description: "日志收集服务 , ip: {{ $labels.instance }} "
          summary: "logstash检测"

  - name: "内存检测"
    rules:
      # `MemTotal_bytes > 5000000000` in the denominator is a PromQL comparison
      # filter: it restricts the rule to hosts with > 5 GB of total memory.
      # NOTE(review): the alert names say "4G" but the cutoff is 5000000000
      # (~5 GB) — TODO confirm the intended boundary.
      - alert: "内存检测大于4G"
        expr: 100 - ( node_memory_Cached_bytes{job="node_exporter_alert"} + node_memory_Buffers_bytes{job="node_exporter_alert"} + node_memory_MemFree_bytes{job="node_exporter_alert"} ) / (node_memory_MemTotal_bytes{job="node_exporter_alert"} > 5000000000 ) * 100 > 98
        for: 30s
        labels:
          severity: critical
        annotations:
          description: "Prometheus 报警: 内存使用率超过98%\n 主机名: {{ $labels.hostname }}\n ip: {{ $labels.ip }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 可用区: {{$labels.region}} \n 产品线: {{ $labels.product }} \n"
      - alert: "内存检测小于4G"
        expr: 100 - ( node_memory_Cached_bytes{job="node_exporter_alert"} + node_memory_Buffers_bytes{job="node_exporter_alert"} + node_memory_MemFree_bytes{job="node_exporter_alert"} ) / (node_memory_MemTotal_bytes{job="node_exporter_alert"} <= 5000000000 ) * 100 > 98
        for: 30s
        labels:
          severity: critical
        annotations:
          description: "Prometheus 报警: 内存使用率超过98%\n 主机名: {{ $labels.hostname }}\n ip: {{ $labels.ip }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 可用区: {{$labels.region}} \n 产品线: {{ $labels.product }}\n"

  - name: "RedisClient"
    rules:
      - alert: "Clients20k"
        expr: redis_connected_clients > 20000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 实例 {{ $labels.addr }} client 超 20k "
          description: "Redis {{ $labels.instance }} client 超过 20k \n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - name: "RedisMem"
    rules:
      # Fixed: the original compared the raw used/total ratio (always <= ~1)
      # against 60/70/80/90, so these alerts could never fire; the ratio is now
      # scaled to a percentage before the comparison, matching the "%" wording
      # in the annotations.
      # NOTE(review): redis_exporter and node_exporter series may not share
      # identical label sets, in which case this division matches no series —
      # TODO confirm, and add on()/group_left matching if needed.
      - alert: "OutOfMemory60"
        expr: redis_memory_used_bytes / node_memory_MemTotal_bytes * 100 > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 实例 {{ $labels.addr }} 内存超过 60% "
          description: "Redis {{ $labels.instance }} 内存超过 60% \n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
      - alert: "OutOfMemory70"
        expr: redis_memory_used_bytes / node_memory_MemTotal_bytes * 100 > 70
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 实例 {{ $labels.addr }} 内存超过 70% "
          description: "Redis {{ $labels.instance }} 内存超过 70% \n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
      - alert: "OutOfMemory80"
        expr: redis_memory_used_bytes / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 实例 {{ $labels.addr }} 内存超过 80% "
          description: "Redis {{ $labels.instance }} 内存超过 80% \n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
      - alert: "OutOfMemory90"
        expr: redis_memory_used_bytes / node_memory_MemTotal_bytes * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis 实例 {{ $labels.addr }} 内存超过 90% "
          description: "Redis {{ $labels.instance }} 内存超过 90% \n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

  - name: "RedisUP"
    rules:
      - alert: "RedisDown"
        expr: redis_up == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Redis down"
          description: " Redis {{ $labels.instance }} is down \n VALUE = {{ $value }}\n"

  - name: "tcp服务检测"
    rules:
      - alert: "tcp服务检测"
        expr: probe_success{job="blackbox-tcp"} == 0 or probe_success{job="redis-sync"} == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          value: " {{ $value }} "
          description: "Prometheus 报警: tcp检测\n 实例: {{ $labels.instance }}\n 当前值: {{ $value }} \n 应用: {{ $labels.app }} \n 产品线: {{ $labels.product }} \n"

  - name: zookeeperStatsAlert
    rules:
      - alert: 堆积请求数过大
        expr: avg(zk_outstanding_requests) by (instance) > 10
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} "
          description: "积请求数过大"
      - alert: 阻塞中的 sync 过多
        expr: avg(zk_pending_syncs) by (instance) > 10
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} "
          description: "塞中的 sync 过多"
      - alert: 平均响应延迟过高
        expr: avg(zk_avg_latency) by (instance) > 10
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} "
          description: '平均响应延迟过高'
      - alert: 打开文件描述符数大于系统设定的大小
        expr: zk_open_file_descriptor_count > zk_max_file_descriptor_count * 0.85
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} "
          description: '打开文件描述符数大于系统设定的大小'
      - alert: zookeeper服务器宕机
        expr: zk_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} "
          description: 'zookeeper服务器宕机'
      - alert: zk主节点丢失
        # Fixed: absent() returns a single series with value 1 when the leader
        # series is missing, and no data otherwise — so the original
        # `absent(...) != 1` could never fire. `== 1` fires exactly when no
        # leader is reported.
        expr: absent(zk_server_state{state="leader"}) == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} "
          description: 'zk主节点丢失'