安装 prometheus
# Start the Prometheus container on the host network.
# Bind mounts: config directory, host timezone file, and the data directory.
# Arguments after the image name are passed to the prometheus binary.
docker run \
  --name prometheus \
  --restart always \
  --network host \
  --expose 9090/tcp \
  -v /data/app/prometheus/config/:/etc/prometheus/ \
  -v /etc/localtime:/etc/localtime \
  -v /data/app/prometheus/data:/prometheus \
  prom/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --web.enable-lifecycle
配置文件(prometheus)
# Prometheus server configuration (loaded via --config.file).

# Global defaults applied to every scrape job and rule group.
global:
  # Scrape targets every 15 seconds. Default is every 1 minute.
  scrape_interval: 15s
  # Evaluate rules every 15 seconds. The default is every 1 minute.
  evaluation_interval: 15s
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration: where fired alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'XXX:9093'

# Alerting rule files loaded by the server.
rule_files:
  - /etc/prometheus/rules.yml

scrape_configs:
  - job_name: 'XXX'
    # Overrides the global scrape_interval for this job.
    scrape_interval: 10s
    # Targets come from JSON files matching the glob; the files are
    # re-read every refresh_interval.
    file_sd_configs:
      - refresh_interval: 1m
        files: ["/etc/prometheus/json/*.json"]
rules配置文件(sample)
# Sample alerting rules.
#
# Every rule carries two custom labels:
#   severity  - tier of the alert: fire > critical > warning
#   alertName - resource family (Instance / CPU / MEM / DISK); the tiers of
#               one resource share the same alertName.
groups:
  - name: test
    rules:
      # --- Availability -----------------------------------------------------
      - alert: InstanceDown
        expr: up == 0
        for: 5s
        labels:
          severity: fire
          alertName: Instance
        annotations:
          summary: "Target:{{ $labels.instance }} Down"
          description: "{{ $labels.instance }} has been down for more than 5s"

      # --- CPU: 100 - idle% , averaged per instance/hostName/hostIp ----------
      - alert: CPUUsage 90%
        expr: 100 - (avg by (instance,hostName,hostIp)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 90
        for: 1m
        labels:
          severity: fire
          alertName: CPU
        annotations:
          summary: "Target:{{ $labels.instance }} CPU使用率超过90%"
          description: "{{ $labels.instance }} CPU使用率超过90% 已经持续1m"
      - alert: CPUUsage 85%
        expr: 100 - (avg by (instance,hostName,hostIp)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 85
        for: 1m
        labels:
          severity: critical
          alertName: CPU
        annotations:
          summary: "Target:{{ $labels.instance }} CPU使用率超过85%"
          description: "{{ $labels.instance }} CPU使用率超过85% 已经持续1m"
      - alert: CPUUsage 70%
        expr: 100 - (avg by (instance,hostName,hostIp)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 70
        for: 5m
        labels:
          severity: warning
          alertName: CPU
        annotations:
          summary: "Target:{{ $labels.instance }} CPU使用率超过70%"
          description: "{{ $labels.instance }} CPU使用率超过70% 已经持续5m"

      # --- Memory: (total - (free + buffers + cached)) / total ---------------
      - alert: MEMUsage 90%
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 90
        for: 1m
        labels:
          severity: fire
          alertName: MEM
        annotations:
          summary: "Target:{{ $labels.instance }} 内存使用率超过90%"
          description: "{{ $labels.instance }} 内存使用率超过90% 已经持续1m"
      - alert: MEMUsage 80%
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          severity: critical
          alertName: MEM
        annotations:
          summary: "Target:{{ $labels.instance }} 内存使用率超过80%"
          description: "{{ $labels.instance }} 内存使用率超过80% 已经持续1m"
      - alert: MEMUsage 70%
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 70
        for: 5m
        labels:
          severity: warning
          alertName: MEM
        annotations:
          summary: "Target:{{ $labels.instance }} 内存使用率超过70%"
          description: "{{ $labels.instance }} 内存使用率超过70% 已经持续5m"

      # --- Disk: (size - avail) / size, per filesystem series ----------------
      # NOTE(review): no fstype/mountpoint filter, so every filesystem series
      # exported for the target is evaluated - confirm that is intended.
      - alert: DISKUsage 90%
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 90
        for: 1m
        labels:
          severity: fire
          alertName: DISK
        annotations:
          summary: "Target:{{ $labels.instance }} 磁盘使用率超过90%"
          description: "{{ $labels.instance }} 磁盘使用率超过90% 已经持续1m"
      # Name matches the 80% threshold used by the expression and annotations.
      - alert: DISKUsage 80%
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
        for: 1m
        labels:
          severity: critical
          alertName: DISK
        annotations:
          summary: "Target:{{ $labels.instance }} 磁盘使用率超过80%"
          description: "{{ $labels.instance }} 磁盘使用率超过80% 已经持续1m"
      - alert: DISKUsage 70%
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 70
        for: 5m
        labels:
          severity: warning
          alertName: DISK
        annotations:
          summary: "Target:{{ $labels.instance }} 磁盘使用率超过70%"
          description: "{{ $labels.instance }} 磁盘使用率超过70% 已经持续5m"
安装 grafana
# Start Grafana, publishing container port 3000 on host port 3000.
docker run \
  --name grafana \
  --restart always \
  --publish 3000:3000 \
  grafana/grafana
安装 alertmanager
# Start Alertmanager on the host network with the config file bind-mounted.
# Restart policy is "always", matching the Prometheus and Grafana containers,
# so alert delivery comes back after a crash or host reboot.
docker run \
--name=alertmanager \
--volume=/data/app/alertmanager/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
--network=host \
--restart=always \
prom/alertmanager
配置文件(alertmanager)
# Alertmanager configuration.
global:
  resolve_timeout: 30m

# Single route: every alert goes to the webhook receiver, grouped per instance.
route:
  group_by: ['instance']
  group_wait: 1s
  group_interval: 1s
  repeat_interval: 5m
  receiver: "webhook"

receivers:
  - name: "webhook"
    webhook_configs:
      - url: 'http://xxx/xx/xx/'
        # Also notify the webhook when an alert resolves.
        send_resolved: true

# Severity-based inhibition: a higher tier mutes the next lower tier, but only
# for the same alertName on the same instance. Both labels must appear in
# `equal`; without `instance`, an alert on one host would suppress the
# lower-tier alerts of every other host.
inhibit_rules:
  - source_match:
      severity: 'fire'
    target_match:
      severity: 'critical'
    equal: ['alertName', 'instance']
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertName', 'instance']