基础设置
配置hosts 172.16.58.78 prome-master01 172.16.58.79 prome-node01 master上生成ssh key 并拷贝到node上 ssh-keygen ssh-copy-id prome_node_01 # 测试ssh联通 ssh prome_node_01
master安装ansible
yum install -y ansible # 关闭hostcheck vim /etc/ansible/ansible.cfg ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o StrictHostKeyChecking=no playbook执行时需要设置机器文件 cat <<EOF > /opt/tgzs/host_file prome-master01 prome-node01 EOF 设置syslog 和logrotate服务 ansible-playbook -i host_file init_syslog_logrotate.yaml 编写ansible 发布服务脚本 ansible-playbook -i host_file service_deploy.yaml -e "tgz=node_exporter-1.1.2.linux-amd64.tar.gz" -e "app=node_exporter" 检查node_exporter服务状态 ansible -i host_file all -m shell -a " ps -ef |grep node_exporter|grep -v grep "
浏览器访问
IP:9100/metrics IP:9100/metrics
本机访问数据
[root@prome_master_01 tgzs]# curl -s localhost:9100/metrics |grep node_ |head -20 # HELP node_arp_entries ARP entries by device # TYPE node_arp_entries gauge node_arp_entries{device="eth0"} 3 # HELP node_boot_time_seconds Node boot time, in unixtime. # TYPE node_boot_time_seconds gauge node_boot_time_seconds 1.616987084e+09 # HELP node_context_switches_total Total number of context switches. # TYPE node_context_switches_total counter node_context_switches_total 2.105979e+06 # HELP node_cooling_device_cur_state Current throttle state of the cooling device # TYPE node_cooling_device_cur_state gauge node_cooling_device_cur_state{name="0",type="Processor"} 0 node_cooling_device_cur_state{name="1",type="Processor"} 0 node_cooling_device_cur_state{name="2",type="Processor"} 0 node_cooling_device_cur_state{name="3",type="Processor"} 0 # HELP node_cooling_device_max_state Maximum throttle state of the cooling device # TYPE node_cooling_device_max_state gauge node_cooling_device_max_state{name="0",type="Processor"} 0 node_cooling_device_max_state{name="1",type="Processor"} 0 node_cooling_device_max_state{name="2",type="Processor"} 0
默认开启的采集项目介绍
关闭默认开启采集项
--no-collector.<name> flag # 未开启前 [root@prome_master_01 node_exporter]# curl -s localhost:9100/metrics |grep node_cpu # HELP node_cpu_guest_seconds_total Seconds the CPUs spent in guests (VMs) for each mode. # TYPE node_cpu_guest_seconds_total counter node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0 node_cpu_guest_seconds_total{cpu="0",mode="user"} 0 node_cpu_guest_seconds_total{cpu="1",mode="nice"} 0 node_cpu_guest_seconds_total{cpu="1",mode="user"} 0 node_cpu_guest_seconds_total{cpu="2",mode="nice"} 0 node_cpu_guest_seconds_total{cpu="2",mode="user"} 0 node_cpu_guest_seconds_total{cpu="3",mode="nice"} 0 node_cpu_guest_seconds_total{cpu="3",mode="user"} 0 # HELP node_cpu_seconds_total Seconds the CPUs spent in each mode. # TYPE node_cpu_seconds_total counter node_cpu_seconds_total{cpu="0",mode="idle"} 17691.27 node_cpu_seconds_total{cpu="0",mode="iowait"} 8.9 node_cpu_seconds_total{cpu="0",mode="irq"} 0 node_cpu_seconds_total{cpu="0",mode="nice"} 0.32 node_cpu_seconds_total{cpu="0",mode="softirq"} 0.28 node_cpu_seconds_total{cpu="0",mode="steal"} 2.7 # 关闭cpu采集 ./node_exporter --no-collector.cpu curl -s localhost:9100/metrics |grep node_cpu
关闭默认采集器项而开机器某些采集
--collector.disable-defaults --collector.<name> . # 只开启mem采集 ./node_exporter --collector.disable-defaults --collector.meminfo # 只开启mem 和cpu 采集 ./node_exporter --collector.disable-defaults --collector.meminfo --collector.cpu
默认关闭的关闭原因
- 太重
- 太慢
- 太多的开销
禁用golang sdk 指标
- 使用
--web.disable-exporter-metrics
promhttp_
代表访问/metrics
的http情况
[root@prome_master_01 tgzs]# curl -s localhost:9100/metrics |grep promhttp_ # HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. # TYPE promhttp_metric_handler_errors_total counter promhttp_metric_handler_errors_total{cause="encoding"} 0 promhttp_metric_handler_errors_total{cause="gathering"} 0 # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. # TYPE promhttp_metric_handler_requests_in_flight gauge promhttp_metric_handler_requests_in_flight 1 # HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. # TYPE promhttp_metric_handler_requests_total counter promhttp_metric_handler_requests_total{code="200"} 8 promhttp_metric_handler_requests_total{code="500"} 0 promhttp_metric_handler_requests_total{code="503"} 0
go_代表goruntime信息等
# HELP go_goroutines Number of goroutines that currently exist. # TYPE go_goroutines gauge go_goroutines 7 # HELP go_info Information about the Go environment. # TYPE go_info gauge go_info{version="go1.15.8"} 1 # HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. # TYPE go_memstats_alloc_bytes gauge go_memstats_alloc_bytes 2.781752e+06
- process 代表进程信息等
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. # TYPE process_cpu_seconds_total counter process_cpu_seconds_total 0.54 # HELP process_max_fds Maximum number of open file descriptors. # TYPE process_max_fds gauge process_max_fds 1024 # HELP process_open_fds Number of open file descriptors. # TYPE process_open_fds gauge process_open_fds 9 # HELP process_resident_memory_bytes Resident memory size in bytes. # TYPE process_resident_memory_bytes gauge process_resident_memory_bytes 1.5720448e+07
节点上自打点数据上报
- 🔲--collector.textfile.directory="" 配置本地采集目录
- 🔲在采集目录里创建.prom文件
# 创建目录 mkdir ./text_file_dir # 准备 prom文件 cat <<EOF > ./text_file_dir/test.prom # HELP nyy_test_metric just test # TYPE nyy_test_metric gauge nyy_test_metric{method="post",code="200"} 1027 EOF # 启动服务 ./node_exporter --collector.textfile.directory=./text_file_dir # curl查看数据 [root@prome_master_01 tgzs]# curl -s localhost:9100/metrics |grep nyy # HELP nyy_test_metric just test # TYPE nyy_test_metric gauge nyy_test_metric{code="200",method="post"} 1027
http传入参数,按采集器过滤指标
原理:通过http请求参数过滤采集器
func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { filters := r.URL.Query()["collect[]"] level.Debug(h.logger).Log("msg", "collect query:", "filters", filters) if len(filters) == 0 { // No filters, use the prepared unfiltered handler. h.unfilteredHandler.ServeHTTP(w, r) return } // To serve filtered metrics, we create a filtering handler on the fly. filteredHandler, err := h.innerHandler(filters...) if err != nil { level.Warn(h.logger).Log("msg", "Couldn't create filtered metrics handler:", "err", err) w.WriteHeader(http.StatusBadRequest) w.Write([]byte(fmt.Sprintf("Couldn't create filtered metrics handler: %s", err))) return } filteredHandler.ServeHTTP(w, r) }
http访问
# 只看cpu采集器的指标 http://IP:9100/metrics?collect[]=cpu # 只看cpu和mem采集器的指标 http://IP:9100/metrics?collect[]=cpu&collect[]=meminfo
prometheus配置
params: collect[]: - cpu - meminfo 两种导入模式 - url导入 - json文件导入
- 和prometheus `relabel_config`的区别 : `按采集器过滤 VS 按metric_name 或label过滤`
- 导入dashboard商城中的node_exporter模板
- 地址 https://grafana.com/grafana/dashboards
两种导入模式
- url导入
- json文件导入
https://grafana.com/grafana/dashboards/8919
配置数据源
验证