安装Prometheus
tar -zxvf prometheus-2.34.0.linux-amd64.tar.gz
mv prometheus-2.34.0.linux-amd64 prometheus
vim prometheus.yml
# my global config global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). # Alertmanager configuration alerting: alertmanagers: - static_configs: - targets: - localhost:9093 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: - "rules/host_rules.yml" # - "first_rules.yml" # - "second_rules.yml" # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. scrape_configs: # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. - job_name: "prometheus" # metrics_path defaults to '/metrics' # scheme defaults to 'http'. static_configs: - targets: ["localhost:9090"] - job_name: 'agent-web01' static_configs: - targets: ['172.31.32.104:9100'] - job_name: 'agent-web02' static_configs: - targets: ['172.31.29.223:9100'] - job_name: 'java' static_configs: - targets: ['172.31.29.223:8100'] metrics_path: '/actuator/prometheus'
创建规则
mkdir -p /root/prometheus/rules
cat host_rules.yml
groups: - name: 系统资源告警规则 rules: - alert: CPU使用率告警 expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80 for: 1m labels: user: prometheus severity: warning annotations: description: "服务器: CPU使用超过80%!(当前值: {{ humanize $value }}%)" - alert: 内存使用率告警 expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80 for: 1m labels: user: prometheus severity: warning annotations: description: "服务器: 内存使用超过80%!(当前值: {{ humanize $value }}%)" - alert: 磁盘告警规则 expr: 100 - (node_filesystem_free_bytes{mountpoint="/",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 70 for: 1m labels: user: prometheus severity: warning annotations: description: "服务器: 磁盘使用超过70%!(当前值: {{ humanize $value }}%)"
启动
nohup ./prometheus &
效果图
安装alertmanager
tar -zxvf alertmanager-0.24.0.linux-amd64.tar.gz
mv alertmanager-0.24.0.linux-amd64 alertmanager
vim alertmanager.yml
具体可以去企业微信后台查找相关参数
global: resolve_timeout: 2m wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/' wechat_api_secret: '<你的企业微信secret>' wechat_api_corp_id: '<你的企业微信id>' route: group_by: ['alertname'] group_wait: 10s group_interval: 10s repeat_interval: 1h receiver: 'wechat' receivers: - name: 'wechat' wechat_configs: - send_resolved: true to_party: '1' agent_id: '<你的企业微信应用id>' templates: - '/alertmanager/*.tmpl'
yaml语法检查
./amtool check-config alertmanager.yml
企业微信报警模板
cat wechat.tmpl
{{ define "wechat.default.message" }} {{- if gt (len .Alerts.Firing) 0 -}} {{- range $index, $alert := .Alerts -}} ======== 异常告警 ======== 告警名称:{{ $alert.Labels.alertname }} 告警级别:{{ $alert.Labels.severity }} 告警机器:{{ $alert.Labels.instance }} {{ $alert.Labels.device }} 告警详情:{{ $alert.Annotations.summary }} 告警时间:{{ $alert.StartsAt.Format "2006-01-02 15:04:05" }} ========== END ========== {{- end }} {{- end }} {{- if gt (len .Alerts.Resolved) 0 -}} {{- range $index, $alert := .Alerts -}} ======== 告警恢复 ======== 告警名称:{{ $alert.Labels.alertname }} 告警级别:{{ $alert.Labels.severity }} 告警机器:{{ $alert.Labels.instance }} 告警详情:{{ $alert.Annotations.summary }} 告警时间:{{ $alert.StartsAt.Format "2006-01-02 15:04:05" }} 恢复时间:{{ $alert.EndsAt.Format "2006-01-02 15:04:05" }} ========== END ========== {{- end }} {{- end }} {{- end }}
启动
nohup ./alertmanager &
安装Grafana
wget https://dl.grafana.com/oss/release/grafana-8.4.6.linux-amd64.tar.gz
tar -zxvf grafana-8.4.6.linux-amd64.tar.gz
mv grafana-8.4.6.linux-amd64 grafana
配置未更改,走的默认,也可以根据具体情况做相应的配置改动
启动
nohup ./grafana &
配置数据源为Prometheus
因为在本地,所以首选localhost
保存并测试
导入配置
grafana dashboard地址
https://grafana.com/grafana/dashboards
11074 ------1 Node Exporter for Prometheus Dashboard EN 20201010
8919 -------1 Node Exporter Dashboard 22/04/13 ConsulManager自动同步版
1860 -------Node Exporter Full
4701 -------JVM (Micrometer)
参考链接
https://blog.51cto.com/u_15060547/3817600 https://www.cnblogs.com/Devinhao/articles/16180018.html