node_exporter 的使用(四)


1、二进制部署node exporter
在Prometheus的架构设计中,Prometheus server并不直接监控特定的目标,其主要任务负责数据的收集、存储并且对外提供数据查询支持。因此为了能够监控到某些指标,如主机的CPU使用率、内存、磁盘,我们需要使用到exporter,Prometheus周期性的从exporter暴露的HTTP服务地址(通常是/metrics)拉取监控样本数据.

[root@prometheus ~]# wget https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz
[root@prometheus ~]# tar xf node_exporter-1.0.1.linux-amd64.tar.gz -C /usr/local/src/
[root@prometheus ~]# ln -s /usr/local/src/node_exporter-1.0.1.linux-amd64 /usr/local/node_exporter
cat >/usr/lib/systemd/system/node-exporter.service <<EOF
[Unit]
Description=Prometheus Server
After=network.target
[Service]
ExecStart=/usr/local/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
EOF
[root@prometheus ~]# systemctl daemon-reload && systemctl enable node-exporter.service && systemctl start node-exporter.service
Created symlink from /etc/systemd/system/multi-user.target.wants/node-exporter.service to /usr/lib/systemd/system/node-exporter.service.
[root@prometheus ~]# ss -lnpt|grep node_exporter
LISTEN     0      128       [::]:9100                  [::]:*                   users:(("node_exporter",pid=11256,fd=3))

查看监控指标,仅截取了一部分

[root@prometheus ~]# curl http://10.0.0.51:9100/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0
go_gc_duration_seconds{quantile="0.25"} 0
go_gc_duration_seconds{quantile="0.5"} 0
go_gc_duration_seconds{quantile="0.75"} 0
go_gc_duration_seconds{quantile="1"} 0
go_gc_duration_seconds_sum 0
go_gc_duration_seconds_count 0
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 9
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.14.4"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 3.495864e+06

2、在Prometheus的server端添加job任务

[root@prometheus ~]# vim /usr/local/prometheus/prometheus.yml 
  - job_name: 'node_exporter'
    static_configs:
    - targets: [ 10.0.0.51:9100 ]
[root@prometheus ~]# systemctl restart prometheus

通过浏览器访问 http://10.0.0.51:9090/targets

扩展(批量添加节点)
在添加大量主机集群时,一台一台在Prometheus.yml中添加,显然不太方便,我们通过编写发现文件,进行批量主机管理
编写自动发现文件
[root@prometheus ~]# mkdir -p /usr/local/prometheus/targets/
[root@prometheus ~]# vim /usr/local/prometheus/targets/node.yml

- targets: 
  - '172.16.214.141:9100'
  - '172.16.214.140:9100'
  - '172.16.214.139:9100'
  labels:
    idc: "bj"               # 备注集群名

修改prometheus的配置文件

[root@prometheus ~]# vim /usr/local/prometheus/prometheus.yml
  - job_name: 'nm_mch-app_node'
    file_sd_configs:
      - files: ['/usr/local/prometheus/targets/node.yml']
        refresh_interval: 5s

3、grafana导入node_exporter的监控图表
https://grafana.com/grafana/dashboards/8919

4、配置node_exporter的告警规则

[root@prometheus ~]# cd /usr/local/prometheus/
[root@prometheus Prometheus]# mkdir rules
cat > /usr/local/prometheus/rules/node_rules.yml << 'EOF'
groups:
- name: Hosts.rules
  rules:
  - alert: HostDown
    expr: up{job=~"node_exporter|prometheus|grafana|alertmanager"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Instance down'
      description: "主机: 【{{ $labels.instance }}】has been down for more than 1 minute"

  - alert: HostCpuLoadAvage
    expr: sum(node_load5) by (instance) > 10
    for: 1m
    annotations:
      title: "5分钟内CPU负载过高"
      description: "主机: 【{{ $labels.instance }}】 5五分钟内CPU负载超过10 (当前值:{{ $value }})"
    labels:
      severity: 'warning'

  - alert: HostCpuUsage
    expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
    for: 1m
    annotations:
      title: "CPU使用率过高"
      description: "主机: 【{{ $labels.instance }}】 5五分钟内CPU使用率超过80% (当前值:{{ $value }})"
    labels:
      severity: 'warning'

  - alert: HostMemoryUsage
    expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
    for: 1m
    annotations:
      title: "主机内存使用率超过80%"
      description: "主机: 【{{ $labels.instance }}】 内存使用率超过80% (当前使用率:{{ $value }}%)"
    labels:
      severity: 'warning'

  - alert: HostIOWait
    expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
    for: 1m
    annotations:
      title: "磁盘负载过高"
      description: "主机: 【{{ $labels.instance }}】 5五分钟内磁盘负载过高 (当前负载值:{{ $value }})"
    labels:
      severity: 'warning'

  - alert: HostFileSystemUsage
    expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 70
    for: 1m
    annotations:
      title: "磁盘空间剩余不足"
      description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区使用率超过70%, 当前值使用率:{{ $value }}%"
    labels:
      severity: 'warning'

  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机swap分区不足"
      description: "主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前值使用率: {{ $value }}%"

  - alert: HostNetworkConnection-ESTABLISHED
    expr:  sum(node_netstat_Tcp_CurrEstab) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机ESTABLISHED连接数过高"
      description: "主机: 【{{ $labels.instance }}】 ESTABLISHED连接数超过1000, 当前ESTABLISHED连接数: {{ $value }}"

  - alert: HostNetworkConnection-TIME_WAIT
    expr:  sum(node_sockstat_TCP_tw) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机TIME_WAIT连接数过高"
      description: "主机: 【{{ $labels.instance }}】 TIME_WAIT连接数超过1000, 当前TIME_WAIT连接数: {{ $value }}"

  - alert: HostUnusualNetworkThroughputIn
    expr:  sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机网卡入口流量过高"
      description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 入口流量超过 (> 100 MB/s), 当前值: {{ $value }}"

  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机网卡出口流量过高"
      description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 出口流量超过 (> 100 MB/s), 当前值: {{ $value }}"

  - alert: HostUnusualDiskReadRate
    expr: sum by (instance, device) (rate(node_disk_read_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘读取速率过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 读取速度超过(50 MB/s), 当前值: {{ $value }}"

  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance, device) (rate(node_disk_written_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘写入速率过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 写入速度超过(50 MB/s), 当前值: {{ $value }}"

  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机分区Inode节点不足"
      description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区inode节点不足 (可用值小于{{ $value }}%)"

  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘Read延迟过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Read延迟过高 (read operations > 100ms), 当前延迟值: {{ $value }}ms"

  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "主机磁盘Write延迟过高"
      description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Write延迟过高 (write operations > 100ms), 当前延迟值: {{ $value }}ms"
EOF

修改Prometheus的配置文件

rule_files:
  - "rules/node_rules.yml"
或者写全路径:/usr/local/prometheus/rules/node_rules.yml
[root@prometheus prometheus]# systemctl restart prometheus

在Prometheus的界面可以看到配置的规则


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM