下载地址
https://github.com/prometheus/jmx_exporter/releases
直接下载jar包
wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.16.1/jmx_prometheus_javaagent-0.16.1.jar
或者下载源码包,解压后自行编译jar包
tar xf jmx_exporter-parent-0.16.1.tar.gz
cd jmx_exporter-parent-0.16.1/
./mvnw package
jar 包存放位置
cd jmx_exporter-parent-0.16.1/jmx_prometheus_javaagent/target/
目录结构
[root@node-1 xxl-job-admin]# ls
Dockerfile jmx_prometheus_javaagent-0.16.1.jar jmx.yaml pom.xml src target
dockerfile 模板
[root@node-1 xxl-job-admin]# cat Dockerfile
FROM openjdk:8-jre-slim
MAINTAINER lhz
ENV PARAMS=""
ENV TZ=PRC
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
ADD jmx.yaml /jmx.yaml
ADD target/xxl-job-admin-*.jar /app.jar
ADD jmx_prometheus_javaagent-0.16.1.jar /jmx.jar
ENTRYPOINT ["sh","-c","java -jar -javaagent:/jmx.jar=7070:/jmx.yaml $JAVA_OPTS /app.jar $PARAMS"]
生成镜像
docker build -t "xxl:v1" .
配置文件
[root@node-1 xxl-job-admin]# cat jmx.yaml
---
startDelaySeconds: 0
#hostPort: 0.0.0.0:1234  # 以 javaagent 方式运行时不能配置 hostPort/jmxUrl(它们用于连接远程 JMX),否则 agent 会报配置错误
username:
password:
#jmxUrl: service:jmx:rmi:///jndi/rmi://127.0.0.1:1234/jmxrmi
ssl: false
lowercaseOutputName: false
lowercaseOutputLabelNames: false
whitelistObjectNames: ["org.apache.cassandra.metrics:*"]
blacklistObjectNames: ["org.apache.cassandra.metrics:type=ColumnFamily,*"]
rules:
- pattern: 'org.apache.cassandra.metrics<type=(\w+), name=(\w+)><>Value: (\d+)'
name: cassandra_$1_$2
value: $3
valueFactor: 0.001
labels: {}
help: "Cassandra metric $1 $2"
cache: false
type: GAUGE
attrNameSnakeCase: false
编写编排文件(Service),通过 NodePort 把 7070 端口映射出去
apiVersion: v1
kind: Service
metadata:
name: xxl
namespace: xxl
labels:
app: xxl
spec:
type: NodePort
ports:
- port: 5601
name: jdk
- port: 7070
targetPort: 7070
protocol: TCP
nodePort: 30001
name: jvm
selector:
app: xxl
通过 节点IP:30001/metrics 访问,确认能看到指标输出
告警规则 需要根据上面的模板修改
mkdir -pv /data/pkgs/yibot_lflk_tools_v4.0/prometheus/jvm-exporter/etc/conf.d/rules
vim /data/pkgs/yibot_lflk_tools_v4.0/prometheus/jvm-exporter/etc/conf.d/rules/rule_jvm_alert.yml
groups:
- name: jvm-alerting
rules:
# down了超过1分钟
- alert: instance-down
expr: up == 0
for: 1m
labels:
severity: 严重
team: 运维
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
# down了超过5分钟
- alert: instance-down
expr: up == 0
for: 5m
labels:
severity: 灾难
team: 运维
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
# 堆空间使用超过50%
- alert: heap-usage-too-much
expr: jvm_memory_bytes_used{job="jvm-exporter", area="heap"} / jvm_memory_bytes_max * 100 > 50
for: 1m
labels:
severity: 警告
team: 运维
annotations:
summary: "JVM Instance {{ $labels.instance }} memory usage > 50%"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [heap usage > 50%] for more than 1 minutes. current usage ({{ $value }}%)"
# 堆空间使用超过80%
- alert: heap-usage-too-much
expr: jvm_memory_bytes_used{job="jvm-exporter", area="heap"} / jvm_memory_bytes_max * 100 > 80
for: 1m
labels:
severity: 严重
team: 运维
annotations:
summary: "JVM Instance {{ $labels.instance }} memory usage > 80%"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [heap usage > 80%] for more than 1 minutes. current usage ({{ $value }}%)"
# 堆空间使用超过90%
- alert: heap-usage-too-much
expr: jvm_memory_bytes_used{job="jvm-exporter", area="heap"} / jvm_memory_bytes_max * 100 > 90
for: 1m
labels:
severity: 灾难
team: 运维
annotations:
summary: "JVM Instance {{ $labels.instance }} memory usage > 90%"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [heap usage > 90%] for more than 1 minutes. current usage ({{ $value }}%)"
# 在5分钟里,Old GC花费时间超过30%
- alert: old-gc-time-too-much
expr: increase(jvm_gc_collection_seconds_sum{gc="PS MarkSweep"}[5m]) > 5 * 60 * 0.3
for: 5m
labels:
severity: 警告
team: 运维
annotations:
summary: "JVM Instance {{ $labels.instance }} Old GC time > 30% running time"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [Old GC time > 30% running time] for more than 5 minutes. current seconds ({{ $value }}s)"
# 在5分钟里,Old GC花费时间超过50%
- alert: old-gc-time-too-much
expr: increase(jvm_gc_collection_seconds_sum{gc="PS MarkSweep"}[5m]) > 5 * 60 * 0.5
for: 5m
labels:
severity: 严重
team: 运维
annotations:
summary: "JVM Instance {{ $labels.instance }} Old GC time > 50% running time"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [Old GC time > 50% running time] for more than 5 minutes. current seconds ({{ $value }}s)"
# 在5分钟里,Old GC花费时间超过80%
- alert: old-gc-time-too-much
expr: increase(jvm_gc_collection_seconds_sum{gc="PS MarkSweep"}[5m]) > 5 * 60 * 0.8
for: 5m
labels:
severity: 灾难
team: 运维
annotations:
summary: "JVM Instance {{ $labels.instance }} Old GC time > 80% running time"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been in status [Old GC time > 80% running time] for more than 5 minutes. current seconds ({{ $value }}s)"
# 4、准备lflk-test.json
vim /data/pkgs/yibot_lflk_tools_v4.0/prometheus/jvm-exporter/etc/conf.d/lflk-test.json
[
{
"targets": [
"x.x.x.x:30001"
],
"labels": {
"tag": "lflk",
"system": "大地调度系统",
"env": "测试环境",
"owner": "大地",
"cloud": "华为云",
"ip": "x.x.x.x",
"java": "adapter"
}
},
{
"targets": [
"x.x.x.x:30001"
],
"labels": {
"tag": "lflk",
"system": "大地调度系统",
"env": "测试环境",
"owner": "大地",
"cloud": "华为云",
"ip": "x.x.x.x",
"java": "watcher"
}
}
]
# 5、启动容器
docker run -d --name prometheus-jvm-exporter -p 9101:9090 -v /etc/localtime:/etc/localtime \
-v /data/pkgs/yibot_lflk_tools_v4.0/prometheus/jvm-exporter/etc/prometheus.yml:/etc/prometheus/prometheus.yml \
-v /data/pkgs/yibot_lflk_tools_v4.0/prometheus/jvm-exporter/etc/conf.d:/etc/prometheus/conf.d \
prom/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
Grafana 官方仪表盘模板
https://grafana.com/grafana/dashboards/12856/revisions